diff --git a/app.py b/app.py index 76be6bb5fa9103091688d197be6c5c55f8dd64a7..d7ea4bcfdebce4f6dabf2b095392c24d8cddd30e 100644 --- a/app.py +++ b/app.py @@ -55,7 +55,8 @@ with gr.Blocks() as block: ) # Define different captions for each table - default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword.
The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806).
$\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$" + default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword.
The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806).
Unlike the results in our paper, we use only the Core results with CoT prompting here, for clarity and compatibility with the released data.
$\\text{Overall} \\ = \\ \\frac{\\text{Core} \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$ " + single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword.
This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only.
Compared to the default table, some models with only single-image support are added." caption_component = gr.Markdown( diff --git a/constants.py b/constants.py index 5ede1ed201bf80874ccae05283aa8832056c6e10..d6e2eb11c7d02f59ef39d5e3be9127709089b65f 100644 --- a/constants.py +++ b/constants.py @@ -28,7 +28,7 @@ We aim to provide cost-effective and accurate evaluation for multimodal models, ## 📊🔍 Results & Takeaways from Evaluating Top Models -- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0622) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension). +- GPT-4o (0513) and Claude 3.5 Sonnet (1022) lead the benchmark. Claude 3.5 Sonnet (1022) improves over Claude 3.5 Sonnet (0620) obviously in planning tasks (application dimension) and UI/Infographics inputs (input format dimension). - Qwen2-VL stands out among open-source models, and its flagship model gets close to some proprietary flagship models - Chain-of-Thought (CoT) prompting improves proprietary models but has limited impact on open-source models - Gemini 1.5 Flash performs the best among all the evaluated efficiency models, but struggles with UI and document tasks diff --git a/static/eval_results/Default/Aquila_VL_2B/summary_results.json b/static/eval_results/Default/Aquila_VL_2B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8aaeeec492de6dabf76847d0cb433cab957a2f9d --- /dev/null +++ b/static/eval_results/Default/Aquila_VL_2B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.159970161379836, + "micro_mean_score": 0.15844711671722148 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.24567572098570653, + "micro_mean_score": 0.2704213241616509 + }, + "overall_score": 0.17100157004197775 + }, + 
"keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.1796551584774396 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.1263506560912463 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.1775085349123463 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.2114933522881099 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.16251700109869488 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.26453155444796583 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.3729498746867168 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.19090788408036002 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.16500679466160564 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.03972686819521137 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.07035116566014021 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.11915109312705179 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.18915652635850314 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.21939978337316163 + }, + "Artistic and Creative Content": { + "count": 32, + 
"num_samples": 541, + "tasks": [], + "average_score": 0.17643260913333875 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.2438396314831894 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.08989401697906672 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.12241197113963243 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.10758402844431432 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.19372082302321905 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.19201243810115767 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.23278612647548963 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.21664527852608348 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.12138133030990172 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.01221681479628382 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.17994400163273605 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.21939978337316163 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.18212149746318507 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.21563163558700174 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.0981320856519089 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + 
"average_score": 0.0557399538308785 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.1351126472094214 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.2025034827431662 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.29326275059361956 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.22529225586731416 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.23810497886903373 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.17867138975396438 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Aquila_VL_2B/task_results.json b/static/eval_results/Default/Aquila_VL_2B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1c82047a713cf36db47e26bd58bec56a58a4ce85 --- /dev/null +++ b/static/eval_results/Default/Aquila_VL_2B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "functionality_matching_in_different_objects", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "location_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.7138859642533433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + 
"Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", 
+ "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.20357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": 
"9-image or more" + }, + { + "name": "signage_navigation", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.061224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial 
and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5106271997072349, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "extract_webpage_headline", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_visual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial 
and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.020071738122614136, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "cultural_vqa", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": 
"rocks_samples_compare", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.65, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "paper_review_acceptance", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.2857142857142857, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "photoshop_operation", + "score": 0.08928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + 
"name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.07352941176470588, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.0707070707070707, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ishihara_test", + "score": 0.2571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_eval_factual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_comparison", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language 
Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text 
Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 2.0145220870414344e-06, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "transit_map_intersection_points", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_sentiment", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "GUI_Act_Web_Single", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.018365235176046272, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "vln_english_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "arxiv_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"video_grounding_spatial", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.21776661396440047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.0606060606060606, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "web_action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_homepage_profile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_camera_motion_description", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "code_add_tag", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + 
"output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.4600736842105264, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.07586052215812712, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.07539682539682542, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.08888888888888889, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "physical_property_reasoning", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", 
+ "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + 
"score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "funsd_document_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + 
"name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.08928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.5111111111111112, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "dvqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language 
Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.2665237458765274, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.034222739980969856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 
0.21428571428571422, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "stock_price_future_prediction", + "score": 0.1700714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and 
Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "map_diagram_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, 
+ { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.18333333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and 
Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.13601920423828534, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": 
"electricity_load_estimate_plot", + "score": 0.09999999999999996, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.3688571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.11224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.08, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "movie_info_parsing", + "score": 0.10714285714285714, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.04401625959886561, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TV_show_info_parsing", + "score": 0.10317460317460318, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.03361344537815126, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "famous_building_recognition", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + 
"output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.4375, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "3d_fragments_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.3578947368421052, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": 
"code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "science_basic_physics", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object 
Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.28888888888888886, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.11607142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.14799999999999996, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "humor_understand_caption_match", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, 
+ { + "name": "emotion_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "stock_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.30685136455043505, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.4107142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.4050000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + 
"name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.13333333333333333, 
+ "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + 
"skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams 
and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6823529411764706, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.2, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + 
"score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_artifact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_shapes", + "score": 0.11862244897959183, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": 
"Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.09268707482993196, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and 
Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.020833333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.35172413793103446, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.07857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.19310344827586204, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.22413793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.17857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.1866666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.4142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.3482758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.5157894736842106, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "generated_video_artifacts", + "score": 0.24375000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene 
and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.060000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "table2latex_complex", + "score": 0.25555555555555554, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.22, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.14482758620689654, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.43793103448275855, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language 
Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.37931034482758613, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.48125, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "video2notes", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.20344827586206896, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + 
"input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.18666666666666673, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.3578947368421052, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.12142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.1758620689655172, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + 
"name": "traffic_accident_analysis", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.23571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5038461538461537, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 
0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.084, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4548387096774193, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.2533333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 
0.26896551724137935, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.37368421052631573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.2157894736842105, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "docci_image_description_long", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + 
"input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.12142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.3428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.07857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and 
Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": 
"Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.836842105263158, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8421052631578949, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.5450000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.61, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.4499999999999999, + "eval_type": "llm", 
+ "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8578947368421055, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.6849999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Aria/summary_results.json b/static/eval_results/Default/Aria/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..348d2d8b5f081312e2c2629ee53791750ebf9e42 --- /dev/null +++ b/static/eval_results/Default/Aria/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.289073788209904, + "micro_mean_score": 0.2859007507765791 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.5103725263180767, + "micro_mean_score": 0.5349957007738607 + }, + "overall_score": 0.31755778420402525 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.3153649050553317 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + 
"average_score": 0.34425736922415495 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.3921740378709932 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.37623282710622424 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.271674311347156 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.46313777834281344 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5692180451127821 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.3152064038837139 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.23851147782276536 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.11246568298589892 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.28561724084490353 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2505346698796475 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3040414715952029 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.41865640360591405 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3622713579911698 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.35872259826035346 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + 
"tasks": [], + "average_score": 0.1509096092007215 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.2846987779732631 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.2899384042262363 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.27412885527802433 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3117275816801635 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.4523860109667709 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.310055869988487 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.18301681783824644 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.26651659725352617 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.34236220565522313 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.41865640360591405 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.19142683154129833 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2596336265133595 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.3929243812973524 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.1403503245041943 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.25367910605102256 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.3494812758481046 + }, + "Metrics": { 
+ "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3662927672998609 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.28616079233761366 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3953949223279651 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.26097385403450996 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Aria/task_results.json b/static/eval_results/Default/Aria/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c211356d1853516b2bf00bd3f361964f56fec5ed --- /dev/null +++ b/static/eval_results/Default/Aria/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "functionality_matching_in_different_objects", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.647700584092455, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + 
"score": 0.12244897959183673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + 
"Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.26071428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.5515873015873015, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signage_navigation", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.75, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.10204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5736169347206616, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": 
"Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 
0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.034980972645280155, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge 
and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.9404761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "photoshop_operation", + "score": 0.27976190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"healthcare_info_judgement", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.047619047619047616, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.3740196078431373, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.1414141414141414, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + 
}, + { + "name": "chinese_idiom_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ishihara_test", + "score": 0.18571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.22916666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User 
Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"music_sheet_name", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition 
(OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.10270184425364004, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "transit_map_intersection_points", + "score": 0.16071428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_sentiment", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": 
"GUI_Act_Web_Single", + "score": 0.0004322891149600856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.3016653054893176, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.8571428571428571, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "vln_english_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning 
and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene 
and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.3965511477601961, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + 
}, + { + "name": "TV_show_retrieval_by_character", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.14545454545454545, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "medical_abdomen_MRI_organ_recognition", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "web_action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "position_relationship", + "score": 0.2, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_homepage_profile", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_camera_motion_description", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.003968253968253968, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, 
+ { + "name": "code_add_tag", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"electricity_plot_future_prediction", + "score": 0.3275052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5092740145201512, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.7103174603174602, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.04242857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": 
"clevrer_video_moving_object_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.24583333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "funsd_document_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.21327870239533622, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.48214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.6222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "dvqa", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.33392901390622537, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.01240457703042735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": 
"contextual_formatted_text", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8896103896103896, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "stock_price_future_prediction", + "score": 0.6950714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical 
and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal 
Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.38468048224583823, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5441428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": 
"Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "quizlet_question_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + 
"score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8993142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5510204081632654, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.6285714285714284, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.2916666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.66, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and 
Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "movie_info_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.048240995325583465, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9411764705882355, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "famous_building_recognition", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"nlvr2_two_image_compare_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.4888888888888889, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + 
"app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.6126284210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"places365_scene_type_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.1646958321823573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5973333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": 
"humor_understand_caption_match", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "stock_info_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5459704567502287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.25704365079365077, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.5011904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6962105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.23333333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "memorization_chinese_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene 
and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"table_understanding_complex_question_answering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", 
+ "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" 
+ }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User 
Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or 
more" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + 
"name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.14285714285714285, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": 
"multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_artifact", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_shapes", + "score": 0.21811224489795916, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.1498015873015873, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + 
"name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], 
+ "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.041666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.1904761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"multiple_states_identify_americas", + "score": 0.08571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8103448275862066, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.47142857142857136, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.6620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.46896551724137925, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.32142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.6888888888888888, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.7965517241379312, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.5157894736842106, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "generated_video_artifacts", + "score": 0.21874999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.5266666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + 
"num_input": "4-5 images" + }, + { + "name": "table2latex_complex", + "score": 0.5666666666666668, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.31333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.7551724137931034, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.7275862068965517, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.7551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.8375000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "video2notes", + "score": 0.5428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.5758620689655174, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.6428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.32105263157894737, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.5214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.6071428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.48275862068965514, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.25, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.7071428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event 
Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.24999999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.8285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.75, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.172, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and 
Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.7000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.47419354838709676, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.5133333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.7448275862068966, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7736842105263159, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and 
Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.37894736842105275, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "docci_image_description_long", + "score": 0.6714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + 
{ + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.3928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.08571428571428573, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.35714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.32142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.2357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.3714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.8894736842105264, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.836842105263158, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7750000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.7000000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.41, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.7200000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Claude_3.5/summary_results.json b/static/eval_results/Default/Claude_3.5/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d90792e8314fea5c53f068d815fd6ebdff3bd724 --- /dev/null +++ b/static/eval_results/Default/Claude_3.5/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.5040975742801586, + "micro_mean_score": 0.5002259116666758 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.6373907158949892, + "micro_mean_score": 0.6569647463456579 + }, + "overall_score": 0.5212541172602853 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5405089647404562 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.6082834220752651 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.5745077617490254 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5450038475783499 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.4767692987630454 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + 
"average_score": 0.5756126284078804 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6969774436090224 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.5278843049497918 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.4082144793870471 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.23803578664609892 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.5691641481808987 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4795267886975966 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.525848282456283 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.508735695828719 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5699094130430454 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5096772701625744 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.4429640420975014 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.5066797418318023 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.4971460788134188 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5278127103234661 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4490020843308984 + }, + 
"open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5838224169821388 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5456152399978661 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.46300075585789874 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5414381873407914 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5373019912310933 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.508735695828719 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4422556748863689 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.49311554035078103 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.6663170946790707 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.3382015835012861 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.5194010220575684 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.532329797132399 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5808831682303479 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.513474611293123 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5507075880782885 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.47461998432626556 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Claude_3.5/task_results.json 
b/static/eval_results/Default/Claude_3.5/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b2bbfdc4b11a52bdb098938b20cf495eb95baa2e --- /dev/null +++ b/static/eval_results/Default/Claude_3.5/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "code_translation_Python", + "score": 0.6458333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "dish_ingredient_match", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_identify_location", + "score": 0.3878787878787879, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language 
Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_eval_visual_pref", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.47189890122171807, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.4997371675943104, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" 
+ }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.8823529411764706, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "booking_web_rating", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "ishihara_test", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "llavaguard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "photoshop_operation", + "score": 0.32857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "multilingual_news_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text 
Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "worldle", + "score": 0.31144102130193474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 
0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "code_match_problem", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "code_translation_advanced", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + 
"Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "signage_navigation", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": 
"contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "sign_language", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_grounding_spatial", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "Ad_count_detection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.003968253968253968, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.9642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.3428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "location_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.5714285714285714, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.7172619047619049, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "chess_find_legal_moves", + "score": 0.06698805429719713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.42424242424242425, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0035714285714285718, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", 
+ "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" 
+ }, + { + "name": "rocks_samples_compare", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "mensa_iq_test", + "score": 0.5495098039215687, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "paper_review_rating", + "score": 0.6543300312736264, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "video_camera_motion_description", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "play_go_capture_stone", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.07140372068949602, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting_multi_image", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5487385867546344, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.8235294117647058, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_format_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.7624716553287981, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + 
"score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.8601190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.7448979591836732, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cheapest_flight_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.6787142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", 
+ "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "movie_info_parsing", + "score": 0.7321428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_programming_test_easy", + "score": 0.5416666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.22448979591836735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + 
], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "mahjong", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.46428571428571425, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + 
{ + "name": "chess_sygyzy_endgames", + "score": 0.09714285714285713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "rebus", + "score": 0.5217391304347826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.8250714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", 
+ "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6207368421052633, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.42857142857142855, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "animal_pose_estimation", + "score": 0.2688508092335989, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.7142857142857143, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.7853333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.7053571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.8303571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "graph_connectivity", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.6904761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "arc_agi", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", 
+ "score": 0.8184827502429544, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "license_plate_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7698412698412698, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.7095421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + 
"num_input": "9-image or more" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.7777777777777779, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "nextqa_mc", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.37499999999999994, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "stock_info_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "geometry_area", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.27777777777777773, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"code_programming_test_advanced", + "score": 0.24074074074074073, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "perception_test_video_character_order", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5565966568582713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + 
"score": 0.04739437903890144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical 
and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.5987447167547407, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "figureqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.5753130452443872, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"newspaper_page_parse_and_count", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9087301587301589, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.9415584415584416, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.333520279485717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + 
"score": 0.7636842105263157, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.7071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5531252543894322, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_content_reasoning", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + 
"num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", 
+ "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.8095238095238094, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision 
Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.9047619047619049, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.6428571428571429, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + 
"input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + 
"num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 
0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.7714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_name", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text 
Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or 
more" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + 
"score": 0.82, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "clevr_arithmetic", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "science_molecule_chemistry", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "star_object_interaction_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + 
"score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9841571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.03751549483739501, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.3397142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_unmask", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + 
"output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_control", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + 
"Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.26666666666666666, + 
"eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.5346938775510204, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.4522108843537415, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and 
Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"memorization_famous_treaty", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social 
Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.5529411764705884, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_cell_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.34523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + 
"app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.8235294117647058, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User 
Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.47619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.6222222222222221, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.5952380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.4166666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.6428571428571429, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.8142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.7, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.5857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.7, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + 
"output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "table_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_notes", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.8450000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.7222222222222222, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.8142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.6357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.58, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.58, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "activitynetqa", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.258, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.325, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funny_image_title", + "score": 0.6928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.8444444444444446, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "video_detail_description", + "score": 0.38947368421052636, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.32666666666666655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8473684210526317, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and 
Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.5357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.8857142857142859, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.4928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.3684210526315789, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.7500000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": 
"Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.7142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_humor_understanding", + "score": 0.8931034482758619, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.882758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8551724137931034, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.8827586206896549, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.3285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7307692307692307, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.8750000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "docci_image_description_long", + "score": 0.7928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.5866666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 
0.8500000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.45806451612903226, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.6482758620689654, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.8931034482758619, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.6499999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.7517241379310345, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.6931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.3733333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8310344827586205, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.8551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"image_captioning_with_additional_requirements", + "score": 0.9357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.7071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.9349999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8850000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.8100000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.4142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.7000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.6071428571428571, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.6928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.6357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.4928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.4714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + 
"app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.4357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.5785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.47857142857142854, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.32857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": 
"open_ended_output", + "num_input": "video" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Claude_3.5_new/summary_results.json b/static/eval_results/Default/Claude_3.5_new/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b27da6920bcbd055a5c65f822bb65e8153eeedae --- /dev/null +++ b/static/eval_results/Default/Claude_3.5_new/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.5259191914020757, + "micro_mean_score": 0.5230785894131227 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.6563419761104125, + "micro_mean_score": 0.6724419604471196 + }, + "overall_score": 0.5427062825031487 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5690045172520449 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.6220681231036606 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.6077980666415158 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5511440615639541 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.4885536652013625 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5908204006544897 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6569473684210526 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.5486763511384175 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + 
"num_samples": 2437, + "tasks": [], + "average_score": 0.4315385951907387 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.2909419331017877 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.6048192628845258 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.48924295292319175 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.556418710368288 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4946691340754988 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5558756390298104 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5425198547046186 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.44210335381541843 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.5187252051932875 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.5071121107460066 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5387340524651681 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4824302644151348 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.6242798397166945 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5782691045270721 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 
0.4630277507828528 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5914338446093256 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5636254729390459 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4946691340754988 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4828123870640382 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.48756636014597515 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.6590137441693218 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.39901670035164916 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.5166853031535193 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5561634744977417 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.6123769274172342 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.5512015158810595 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.565796566886933 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.4763267502912362 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Claude_3.5_new/task_results.json b/static/eval_results/Default/Claude_3.5_new/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..9d6ca93f5f2b7acb7f4f83f3d7d701df0fea1f95 --- /dev/null +++ b/static/eval_results/Default/Claude_3.5_new/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "vln_identify_robot", + "score": 0.26666666666666666, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "code_translation_easy", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_cell_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + 
"output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "booking_web_recommendation", + "score": 0.6753968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"TRANCE_physics_reasoning_basic", + "score": 0.7058823529411765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "Bongard_Problem", + "score": 0.23684210526315788, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "code_translation_Python", + "score": 0.6041666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.7142857142857143, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.21818181818181817, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "llavaguard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event 
Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.40241040325976846, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.4714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"structured_output", + "num_input": "9-image or more" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "photoshop_operation", + "score": 0.4095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + 
"name": "GUI_Act_Mobile_tap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "booking_web_rating", + "score": 0.9642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "paper_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5714285714285714, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_translation_hard", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "game_platform_support_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + 
"input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.3154761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.32335405958224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "signage_navigation", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, 
+ { + "name": "circuit_diagram_understanding", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "sign_language", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "Ad_count_detection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 
0.2928571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "counting", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "painting_QA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_advanced", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5952380952380951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.5151515151515151, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and 
Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.003968253968253968, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.40476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + 
}, + { + "name": "functionality_matching_in_different_objects", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.9642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.6607142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"geometry_reasoning_nested_squares", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mensa_iq_test", + "score": 0.4946078431372549, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.06110399705595322, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_sentiment", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "topological_sort", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6764671197732565, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and 
Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.003401360544217687, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + 
"name": "chinese_idiom_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5750644816731951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.8571428571428571, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.7647058823529411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_compare", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "top_video_creator_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": 
[ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_camera_motion_description", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ 
+ "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.02989318393830872, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.24489795918367346, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + 
"num_input": "video" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_prediction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"coco_ood_global_image_retrieval_by_query_property", + "score": 0.7452380952380953, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6530612244897959, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_execution", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.7402142857142857, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "movie_info_parsing", + "score": 0.7589285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + 
"app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.4583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.4375, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "insect_order_classification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based 
Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "soccer_offside", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.5357142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "table_understanding", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_comparison", + 
"score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal 
Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.8218571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.32653061224489793, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "medical_parasite_detection", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.18566544566544566, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.8389999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge 
and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.5217391304347826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.26289170215820523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "weather_info_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.868, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": 
"vlnqa_egocentric_navigation_video", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "orchestra_score_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "code_programming_test_hard", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.711111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_retrieval", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "action_sequence", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"exact_text", + "num_input": "9-image or more" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.7261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "recipe_image_ordering", + "score": 
0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "arc_agi", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.8199708454810496, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "license_plate_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.6805363628538211, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.78125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "figureqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.8650793650793652, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.8811526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.5238095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + 
], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.6339993725717702, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "map_diagram_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4444444444444445, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.4444444444444445, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "weather_info_parsing", + "score": 0.9166666666666669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.6428571428571429, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.4053571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "game_info_parsing", + "score": 0.9480519480519481, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 
0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images 
and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_error_line_identification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.2531109353882501, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "photo_sharing_image_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "code_programming_test_advanced", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "quizlet_question_solving", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7710526315789472, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.09375, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.7642857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "perception_test_video_character_order", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.6062431664706708, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.08106406283795066, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.6274393183836207, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_content_reasoning", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.82, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.2912414965986394, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": 
"6-8 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "star_object_interaction_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal 
Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "panel_images_multi_question", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9743499999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.02306400619990589, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.9285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.23921428571428613, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + 
"score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event 
Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + 
"num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "memorization_papers", + "score": 0.5666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "ocr_math_MATH", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + 
"name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "MMSoc_Memotion", + "score": 0.5529411764705883, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"app_interactive_operations_twitter", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.7058823529411765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.5333333333333333, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_word", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.6428571428571429, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.6928571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and 
Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.8235294117647058, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_aesthetics", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + 
"app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": 
"autorater_unmask", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "autorater_mask", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + 
"output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "autorater_artifact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.5952380952380951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.48888888888888893, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.2, + "eval_type": "rule", + 
"num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "poetry_haiku", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.8095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "poetry_acrostic", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language 
Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_shapes", + "score": 0.5210884353741496, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "shape_composition_colours", + "score": 0.47066326530612246, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.45833333333333326, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.7142857142857143, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.7428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.6285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.6285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D 
Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_30", + "score": 0.5, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.6888888888888888, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.8642857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_summary", + "score": 0.7, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 
0.6133333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.6133333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "activitynetqa", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.282, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.27499999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funny_image_title", + "score": 0.6357142857142858, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.8777777777777779, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "video_detail_description", + "score": 0.42105263157894735, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.38666666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8578947368421055, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.7142857142857144, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.892857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_humor_understanding", + "score": 0.9103448275862066, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.6000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.3631578947368421, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.8620689655172412, + "eval_type": "llm", + "num_demo": 
1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.893103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.8142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.692857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.43571428571428567, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7038461538461539, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.9249999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "docci_image_description_long", + "score": 0.7928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.7266666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.8428571428571431, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4580645161290323, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.6931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.8344827586206894, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.7071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.8379310344827589, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.7413793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.2866666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.786206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.8379310344827587, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.942857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding 
and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.7785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.9650000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.825, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.7699999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.8350000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.29999999999999993, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", 
+ "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.32142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.5785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": 
"multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.6285714285714284, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.5857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.607142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.6214285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + 
"Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.5214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.5928571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/GPT_4o/summary_results.json b/static/eval_results/Default/GPT_4o/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6af57dc0f78b6677c89cf6d73a5396b2d10b16f8 
--- /dev/null +++ b/static/eval_results/Default/GPT_4o/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.5265030595065238, + "micro_mean_score": 0.5236338521693411 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.6478225794744895, + "micro_mean_score": 0.665391229578676 + }, + "overall_score": 0.5421184432647768 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5630758211022604 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.6216411634729735 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.616018277142757 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5823101249498799 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.44177544539510955 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.6345458069232931 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6795263157894738 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.5514924675940659 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.39435038953269674 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.22934807257231926 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 
0.608083455060831 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.491325251564869 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4999089647103332 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5315979872161023 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5641404607063637 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5613545677222056 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.47760591698367955 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.5388690453811203 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.48037685656449847 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5994159671881645 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.44606605087301393 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.6274371950293718 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5448877153826162 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.4751133786848073 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5343350103400748 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5672657028463585 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5315979872161023 
+ }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4500928191484624 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.4908653289106883 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.7056027785545881 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.33202130899313653 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.5032849161169843 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5510350848991218 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.6095778863474799 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.5283797185155754 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.6135723164021851 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.44047720383044436 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/GPT_4o/task_results.json b/static/eval_results/Default/GPT_4o/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..59a8732af607782d6ba09b3aa9592e8facf3ad7c --- /dev/null +++ b/static/eval_results/Default/GPT_4o/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "code_translation_Python", + "score": 0.6458333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.4, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "dish_ingredient_match", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_identify_location", + "score": 0.32727272727272727, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_eval_visual_pref", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.5564421945052599, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + 
"app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.24717887154861945, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.8823529411764706, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 1.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.5571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_content_based_retrieval_radiology", + 
"score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "photoshop_operation", + "score": 0.39642857142857146, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "multilingual_news_qa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "code_translation_hard", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "paper_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + 
"input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.9, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "worldle", + "score": 0.5019920337942146, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_segments_reordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": 
"circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "signage_navigation", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "code_match_problem", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "Ad_count_detection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.3107142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": 
"1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.010912698412698412, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.5050505050505051, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.3571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4379245788668292, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.40294117647058825, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.1858388265990491, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_sentiment", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "paper_review_rating", + "score": 0.6370339174257883, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "play_go_capture_stone", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"counting_multi_image", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5925323909834338, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.0009041591320072332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "position_relationship", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "booking_web_recommendation", + "score": 0.7803571428571426, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "font_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_compare", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "top_video_creator_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_camera_motion_description", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "ascii_art_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 
0.01601312748867357, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + 
"skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.8583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + 
"app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6632653061224488, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "cheapest_flight_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.4767857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "movie_info_parsing", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.3673469387755102, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_easy", + "score": 0.4583333333333333, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.1496598639455782, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + 
"app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "rebus", + "score": 0.6956521739130435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "comic_page_ordering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"LaTeX_complex_formula_convertion", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4404761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": 
[ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "vln_hindi_next_step", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + 
"input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "code_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "stock_price_future_prediction", + "score": 0.7872142857142859, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"3d_fragments_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6918947368421055, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "animal_pose_estimation", + "score": 0.2785198065092178, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.828, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "code_programming_test_hard", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + 
}, + { + "name": "semantic_matching_of_two_images", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"multilingual_game_info_parsing", + "score": 0.8303571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.711111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "graph_connectivity", + "score": 0.95, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.8095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object 
Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.6275228061577963, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.3469387755102041, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.5982549376215841, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "TV_show_info_parsing", + "score": 0.8253968253968255, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative 
Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.7131684210526317, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"game_info_retrieval", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "mnist_pattern", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.5238095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.5867591836191252, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.8000000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.6444444444444445, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9285714285714288, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and 
Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8766233766233764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 
0.37559523809523804, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "stock_info_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "geometry_area", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.2903422951989705, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.32222222222222224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and 
Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7417368421052631, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_advanced", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.7142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + 
"output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "coco_person_detection", + "score": 0.6477943776571286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_content_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 
0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_character_order", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5807339650392197, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.18559785992971775, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.74, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, 
+ { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "clevr_arithmetic", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": 
"constrained_generation_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "science_molecule_chemistry", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.35714285714285715, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.36607142857142866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.8333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", 
+ "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9764785714285713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.9047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.038392686848233396, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.12478571428571421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.55, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 
0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + 
"score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + 
"Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and 
Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": 
"User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.7647058823529411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_word", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User 
Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": 
"app_interactive_operations_alipay", + "score": 0.5882352941176471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.738095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.6785714285714286, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.31111111111111106, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "ball_cup_swap_3", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": 
[ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.47619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_subject", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_unmask", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and 
Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_mask", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "autorater_control", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + 
"name": "autorater_artifact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and 
Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.6041666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"cvbench_adapted_cvbench_distance", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.4562925170068027, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.36553287981859406, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + 
"input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.7428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.7571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.5428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.5297619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_cell_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific 
Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.24404761904761904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cultural_vqa", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical 
Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8949999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images 
and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.9, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.7250000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.765, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.7578947368421054, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.7142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"multi_lingual_Ruozhiba_expalnation_English", + "score": 0.7214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.7071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.6785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.45, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.5642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.6214285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.4214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.6777777777777777, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "meme_explain", + "score": 0.9142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.7357142857142855, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.6333333333333332, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.5533333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "activitynetqa", + "score": 0.7052631578947368, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.36200000000000004, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funny_image_title", + "score": 0.6928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.8666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "video_detail_description", + "score": 0.5684210526315789, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.2866666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" 
+ }, + { + "name": "guess_image_generation_prompt", + "score": 0.8315789473684211, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.8785714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.4421052631578948, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": 
"video_short_title", + "score": 0.7357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.7285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7423076923076924, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.8250000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "docci_image_description_long", + "score": 0.8428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and 
Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.8666666666666668, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.835714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.49354838709677434, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.5533333333333332, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense 
and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.9214285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.6357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.3571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.8620689655172411, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.8310344827586206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8793103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific 
Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.8689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.7310344827586206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.9068965517241377, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.6172413793103447, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.627586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8310344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.8275862068965518, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/GPT_4o_mini/summary_results.json b/static/eval_results/Default/GPT_4o_mini/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e4ea03c1e4c0224b18df4676d6f3f1b2bbef39af --- /dev/null +++ b/static/eval_results/Default/GPT_4o_mini/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.40767494558789397, + "micro_mean_score": 0.40431644154143376 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.586537827213665, + 
"micro_mean_score": 0.6133276010318144 + }, + "overall_score": 0.43069690064863675 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.4492982787524939 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.49026056071002017 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.5168957112681365 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.46731791428406805 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.3406008235342885 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5572925295284307 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6902380952380953 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.4189154010048976 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2943206715105082 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.19422793560945503 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.47202628409684394 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.3624496929166193 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.38946844562183286 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 
0.45508480503584553 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.47569921440672464 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.465175334092545 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.29410984789062117 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.41242028190533997 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3906415365938764 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.44244772638735347 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3629944944697668 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5713834131825314 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.39874839531459466 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.3359977324263039 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.4305788513381019 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.46343334374251277 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.45508480503584553 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.24651576711552803 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.36981497185070983 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5666618234843734 + }, + 
"Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2420320329702607 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.3458483931206892 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.43590838051817093 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5176671720617656 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.3554299482098288 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5399167524341886 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.32918280841495845 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/GPT_4o_mini/task_results.json b/static/eval_results/Default/GPT_4o_mini/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ac1f7d6a1eed9c18f56a4275b0b839ca7f3f7d7b --- /dev/null +++ b/static/eval_results/Default/GPT_4o_mini/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.4404761904761905, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.02971437714058806, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_format_QA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.2653061224489796, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object 
Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.07878787878787878, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language 
Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.5, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.35741427136457926, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "rebus", + "score": 0.30434782608695654, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "photoshop_operation", + "score": 0.24047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.43333333333333335, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "code_translation_Python", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9119047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rocks_samples_compare", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6370339174257883, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "functionality_matching_in_different_objects", + 
"score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.7058673469387756, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "ancient_map_understanding", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mensa_iq_test", + "score": 0.3348039215686274, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "paper_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "location_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.49999999999999994, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User 
Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_sentiment", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "dish_ingredient_match", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "game_platform_support_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "medical_parasite_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_artifact_reason", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.5857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "video_camera_motion_description", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.6904761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + 
"app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "poetry_haiku", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_word", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ocr_article_authors", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "memorization_famous_treaty", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.18482142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.48571428571428577, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.43979842890651355, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.36666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.7857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and 
Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.29292929292929293, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and 
Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.5882352941176472, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "shape_composition_shapes", + "score": 0.22491496598639452, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.2505668934240363, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.7058823529411765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "Bongard_Problem", + "score": 0.17982456140350878, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 
0.048713528589567665, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.024564069093751337, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + 
"skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6020408163265306, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.6265067061623183, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "counting_multi_image", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + 
"score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6480000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7010526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.43662631578947375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + 
"score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.6444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.05952380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "quizlet_question_solving", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "stock_price_future_prediction", + "score": 0.6955714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.457498007685276, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 
0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5423192899685483, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.6488095238095237, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative 
Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.21349206349206354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3777777777777777, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.6498716440678927, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": 
"1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "nextqa_mc", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5455714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.27142857142857146, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6964285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.7275263157894736, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + 
"num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5535393001296958, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "play_go_capture_stone", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.20595238095238094, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"structured_output", + "num_input": "6-8 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "algebra", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9505142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "famous_building_recognition", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_aesthetics", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "top_video_creator_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and 
Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "vln_english_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.1969956173950675, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.35714285714285715, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.24388210678357394, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.4166666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface 
Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.046820973422936174, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + 
}, + { + "name": "landmark_recognition_and_qa", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "chess_winner_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8506493506493505, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" 
+ ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "movie_info_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "star_object_interaction_video", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": 
"video" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.6357142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.8769841269841271, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9705882352941178, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + 
"num_input": "video" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cheapest_flight_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.37142857142857144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.35714285714285715, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.11428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.41428571428571426, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.3958333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "game_info_retrieval", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene 
and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "av_view_identification", + "score": 0.14444444444444443, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "action_prediction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + 
"app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": 
"9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.5, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "position_relationship", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal 
Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.43050085804176885, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_cell_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + 
"input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_author", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.004214285714285663, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7539682539682541, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_content_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "video_eval_visual_pref", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "clevr_arithmetic", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + 
"num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.6142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.8200000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.8375000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "docci_image_description_long", + "score": 0.7214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.5733333333333334, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language 
Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.6931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7615384615384616, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.875862068965517, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.7777777777777779, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "bar_chart_interpretation", + "score": 0.5206896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding 
and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8379310344827584, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.8857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.872413793103448, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.7206896551724139, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8482758620689654, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + 
], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.817241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.8379310344827586, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.9000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video2notes", + "score": 0.6785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8210526315789474, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event 
Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.7827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4548387096774193, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.6857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.31333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "generated_video_artifacts", + "score": 0.43749999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.5714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.4066666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "table2latex_complex", + "score": 0.5666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.6714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + 
"input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8950000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.5466666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "video_detail_description", + "score": 0.4263157894736842, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": 
"open_ended_output", + "num_input": "video" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.4157894736842105, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8800000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.42857142857142855, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.7750000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.7357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.2071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.3368421052631579, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + 
"input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.6428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "visualization_with_code", + "score": 0.5142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.3285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": 
"1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.41428571428571426, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.4642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.4857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_politics", + "score": 0.72, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.3559999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.6571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.4428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.5285714285714286, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "video_content_follow_up", + "score": 0.8642857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json b/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..89748aec2730a7b4dd7c3fbdec0e71c34ad210d5 --- /dev/null +++ b/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.4189319021967416, + "micro_mean_score": 0.41567515414375245 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.5691365176285039, + "micro_mean_score": 0.5987532244196045 + }, + "overall_score": 0.4382651695295427 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.46355333176347063 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.4431807648811706 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.4975887290434539 + }, + 
"Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.49409642663278297 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.38033540105052427 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5621166766717235 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6570726817042606 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.4480877005302385 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.3338006749329557 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.16197013296986068 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.3971534837718938 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.3448204918940882 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.43525833484767545 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4837362543956792 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5111257660425502 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.49366013155105076 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.4001983820478609 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 
0.386988040250785 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3884226428206387 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.4425893080900246 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.42223626366392253 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5390305634303021 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.472066557554629 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.3666950113378685 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.44571360028283974 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.45400479933257654 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4837362543956792 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.35161402777057993 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.3839609821519984 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.4822341581959653 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.26434115361219657 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.3677547363031234 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.4640301382180305 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5348199655361041 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 
0.4890240042560499 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5126038207415967 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.384818434165593 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json b/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..584c3f67d4d2e17e705f05ab9648442c90fd96e8 --- /dev/null +++ b/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "monthly_weather_days_count", + "score": 0.3095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal 
Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.8888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "transit_map_intersection_points", + "score": 0.4068877551020408, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.3469387755102041, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.03886509470801488, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User 
Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_format_QA", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + 
"app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "dish_ingredient_match", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "code_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, 
+ { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.14999999999999997, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": 
"Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"booking_web_recommendation", + "score": 0.6203514739229025, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.8333333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "worldle", + "score": 0.35558727927939476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "orchestra_score_recognition", + "score": 0.03571428571428571, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and 
Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.41666666666666663, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.22271751659129607, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5476190476190476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_execution", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.7558635964363686, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.47990196078431374, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.2727272727272727, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" 
+ }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_eval_factual_pref", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "vln_identify_location", + "score": 0.2303030303030303, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.3100359127375053, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_solution_compare", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + 
"num_input": "video" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 
0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "autorater_artifact", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.0319296239070534, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "autorater_unmask", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_article_journal", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "ocr_math_equation", + "score": 
0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "photoshop_operation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + 
], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_table_to_html", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"video_intent_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_sentiment", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "autorater_aesthetics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + 
"input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9404761904761905, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_word", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "booking_web_rating", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "position_relationship", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + 
"output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "media_homepage_profile", + "score": 0.21282182729551152, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": 
"video_camera_motion_description", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + 
"Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + 
{ + "name": "paper_review_acceptance", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": 
"multilingual_news_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.49714178831993683, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.7261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "play_go_capture_stone", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.30612244897959184, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 
0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.09826063389901919, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5408163265306122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": 
"Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 
0.5916519873131821, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.6702481953279147, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": 
"Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" 
+ }, + { + "name": "app_layout_understanding_instagram", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": 
[ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", 
+ "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "waldo", + "score": 0.0002062628914307136, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 
0.2894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5600000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.5089285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "memorization_papers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.746390336033466, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 
0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9621285714285712, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7057894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.681547619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "action_sequence", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "science_molecule_chemistry", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.5357142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "landmark_check_two_images", + "score": 0.7555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "av_vehicle_multiview_counting", + 
"score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5798723155227672, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.9017526315789473, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.7478991596638657, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "movie_info_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.570486129111546, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.7672857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 
0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + 
"Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.21428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.7220526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.24564101770091742, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.0017402394162957552, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.11578571428571437, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + 
"score": 0.7727272727272726, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "game_info_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "top_rated_hotel_identification", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.2333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + 
"skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.28888888888888886, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.39583333333333326, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", 
+ "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.33333333333333337, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "weather_info_parsing", + "score": 0.7539682539682538, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_match_problem", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + 
"output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.589357142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_test_easy", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.3928571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + 
], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "perception_test_video_character_order", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_area", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.19999999999999998, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.4714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "science_basic_physics", 
+ "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.24492301011444534, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "face_identity_matching", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" 
+ ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "nextqa_mc", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_content_reasoning", + "score": 0.8888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": 
"video" + }, + { + "name": "TV_show_info_parsing", + "score": 0.753968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.30434782608695654, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "poetry_limerick", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + 
"name": "constrained_generation_contain_position_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "graph_theory", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": 
"constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "ball_cup_swap_3", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "video_motion_matching_3D_real", + "score": 
0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07619047619047618, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.27380952380952384, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": 
"contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_cell_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "panel_images_single_question", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "panel_images_multi_question", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.35000000000000003, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene 
and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "shape_composition_shapes", + "score": 0.3137755102040816, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.2828798185941043, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + 
"name": "reward_models_T2I_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.3654761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.7666666666666668, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.8571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": 
"1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.35624999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funny_image_title", + "score": 0.5928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.8222222222222222, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_summary", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 
0.5199999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_detail_description", + "score": 0.563157894736842, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.6199999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.3466666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8263157894736842, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + 
"score": 0.6214285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.5052631578947369, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.45000000000000007, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.8571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_humor_understanding", + "score": 
0.9068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.8187500000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "docci_image_description_long", + "score": 0.7642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.8533333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.32105263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.789655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical 
and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.7758620689655171, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.6310344827586206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.3428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": 
"multi_lingual_manual_explanation_scooter_Russian", + "score": 0.24285714285714283, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.3142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.39333333333333337, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.36428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.3071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8137931034482758, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.1857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.29999999999999993, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.8448275862068967, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.5714285714285714, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.47857142857142865, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8300000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8500000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6884615384615385, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.6642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": 
"Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_content_follow_up", + "score": 0.8214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.3806451612903227, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.705, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.24285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.5214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.7850000000000004, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.32142857142857134, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video2notes", + "score": 0.7071428571428572, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.6482758620689654, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.8689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "art_explanation", + "score": 0.7068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and 
Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json b/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0710bf3e0c920cb7b8109b90e9bcbdfba2792418 --- /dev/null +++ b/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.4822473962867704, + "micro_mean_score": 0.4764805563057179 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.5858190649927173, + "micro_mean_score": 0.6104901117798793 + }, + "overall_score": 0.4955784031499121 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5202055934299538 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.5017043129027509 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.5532599716027446 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.546753787203128 + }, + "Mathematical and Logical Reasoning": { + "count": 109, 
+ "num_samples": 1910, + "tasks": [], + "average_score": 0.425969084163906 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5751012914154264 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6982330827067671 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.513647745999633 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.3845337030093212 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.23899503258223884 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.4625032188638111 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4292353723689881 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4869625906903554 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5028718355967439 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5584779204331461 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.55005349042813 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.4292127751495457 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.44896309957892694 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.44418591808616864 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + 
"tasks": [], + "average_score": 0.5146447350354234 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4688623462674191 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5580414823700747 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5538255562099124 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.39066515495086923 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5370278962809547 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5034399620483027 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5028718355967439 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4885398161821004 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.45544217378728585 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5421439953094952 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.3335324339429373 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.43465181771633377 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5250631828331306 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5821004797173627 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.5124355410095621 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5722329455291694 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + 
"average_score": 0.41210885517904977 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Gemini_1.5_pro_002/task_results.json b/static/eval_results/Default/Gemini_1.5_pro_002/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..97f780571a4c423de0ecfd5d6157ba715845c8f2 --- /dev/null +++ b/static/eval_results/Default/Gemini_1.5_pro_002/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.6199454600186646, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dish_ingredient_match", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "code_translation_easy", + "score": 0.34523809523809523, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "position_relationship", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams 
and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.6772108843537415, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_match_problem", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "Bongard_Problem", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "Ad_count_detection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.4583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.21309523809523806, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + 
"Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.7647058823529411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.3939393939393939, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "llavaguard", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.4119942575491687, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 
0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "photoshop_operation", + "score": 0.15952380952380954, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "media_homepage_profile", + "score": 0.3056838524883637, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.29292929292929293, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition 
(OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_spatial", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "ishihara_test", + "score": 0.31428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": 
"GUI_Act_Web_Multi", + "score": 0.4699566675933124, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "transit_map_intersection_points", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + 
"score": 0.4656862745098039, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "game_platform_support_identification", + "score": 0.9642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "music_sheet_sentiment", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "paper_review_rating", + "score": 0.7093310229186855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and 
Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "chess_find_legal_moves", + "score": 0.06762834530316385, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "worldle", + "score": 0.4497384340940744, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.29166666666666663, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.35119047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_review_acceptance", + "score": 0.6, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "annoying_word_search", + "score": 0.0035714285714285718, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.3877551020408163, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "stock_info_parsing", + "score": 0.8025210084033615, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.831857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09619047619047619, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "rebus", + "score": 0.391304347826087, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "geometry_length", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_camera_motion_description", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User 
Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.03864007436439077, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6723157894736841, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + 
"num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.30612244897959184, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.3333333333333333, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "planning_visual_grippers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.6726190476190476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_grounding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.30454267975765786, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5510204081632654, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": 
"contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_info_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "cheapest_flight_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.668, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + 
"score": 0.7153571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 
0.48214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "video_eval_factual_pref", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.14285714285714285, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_programming_test_hard", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.48214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "recipe_image_ordering", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.9523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": 
"Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5238095238095237, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "planning_screenshot_floortile", 
+ "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "insect_order_classification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "arc_agi", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + 
"name": "IAM_line_ocr_and_locate", + "score": 0.7099528290771637, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "license_plate_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.14711083476825218, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial 
and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7460317460317462, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.5595238095238094, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "face_keypoint_detection", + "score": 0.6758816417011395, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6666666666666666, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.82, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.8486368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "algebra", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { 
+ "name": "mnist_pattern", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.3333333333333333, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.8222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.555696767990635, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.2482142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"newspaper_page_parse_and_count", + "score": 0.4666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7089473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.8174603174603176, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.3857142857142857, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8246753246753247, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.66869355335515, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "video_content_reasoning", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9712357142857144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "research_website_parsing_blogpost", + 
"score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.016738273048656067, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "single_person_pose_estimation", + 
"score": 0.32509082865144884, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "nextqa_mc", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.2693571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.3803571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "chart_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.8333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + 
"score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical 
Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.7357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, 
+ { + "name": "pictionary_chinese_food_img2en", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.35, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "ball_cup_swap_3", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_aesthetics", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "geometry_area", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + 
"name": "autorater_subject", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.2777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_unmask", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative 
Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + 
"name": "ocr_math_equation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "autorater_mask", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "code_programming_test_advanced", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + 
"output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.4, + "eval_type": "rule", + 
"num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", 
+ "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0625, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge 
and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", 
+ "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.36734693877551017, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.40232426303854874, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" 
+ }, + { + "name": "ocr_article_journal", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6117647058823531, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_character_order", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5945319390969315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.35714285714285715, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.49999999999999994, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.8095238095238094, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.5416666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.4142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.5142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.6285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.3714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "cultural_vqa", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"code_output_result", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": 
"realworld_qa_en2cn", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.35714285714285715, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.36875, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.7111111111111111, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language 
Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.788888888888889, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.35714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_summary", + "score": 0.7, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.5666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": 
"Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_detail_description", + "score": 0.594736842105263, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.6133333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.41999999999999993, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8263157894736843, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.5357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.4473684210526316, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.48, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.692857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.835714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_humor_understanding", + "score": 0.8896551724137929, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", 
+ "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.80625, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "docci_image_description_long", + "score": 0.7571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.8, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.35263157894736835, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.8241379310344826, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"science_figure_explanation", + "score": 0.8206896551724137, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.8103448275862067, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.8500000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.42580645161290337, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.8758620689655172, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.6655172413793102, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.6357142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.6000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.3933333333333333, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8137931034482755, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.7965517241379312, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8857142857142859, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"table_understanding_fetaqa", + "score": 0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.82, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7750000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "video2notes", + "score": 0.7142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + 
"input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "red_teaming_politics", + "score": 0.74, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6346153846153848, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.48571428571428577, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_racial", + "score": 0.765, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.42142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.6428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + 
"Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.21428571428571433, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.45000000000000007, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 
0.15000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.3428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.22142857142857147, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.3000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.23571428571428577, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.33571428571428574, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Idefics3/summary_results.json b/static/eval_results/Default/Idefics3/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ce15d5ce2e9339df0bcb8985694132ef9048c00a --- /dev/null +++ b/static/eval_results/Default/Idefics3/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.08956972487602757, + "micro_mean_score": 0.08982225274252693 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.3210866162255635, + "micro_mean_score": 0.35649183147033553 + }, + "overall_score": 0.11936892871309657 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.123378776179585 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.09602065544451607 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.1661543932339007 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.13018902877020821 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.11200133210641629 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.1837120314657304 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.2364085213032582 + 
}, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.15239546294916975 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.08255834173646705 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.03149369112824262 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.06151607584357764 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.10124344675801887 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.14147248511867794 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.15942387460900312 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.17458268378399872 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.13442937440893113 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.02766884416043467 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.15513016850044997 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.03757596375966502 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.05386631116442094 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.0760949224506388 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.2987797010800956 + }, + "multiple_choice": { + "count": 
85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.10403841600436024 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.0661753590325019 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.09190674791720088 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.12345439179884048 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.15942387460900312 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.11382786944230487 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.10803808254834846 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.11450308988278819 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.04671278220005028 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.0978814644137225 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.13283830731528018 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.09697463995668018 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.1840497279921703 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.1605667124060194 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.09835465288235297 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Idefics3/task_results.json b/static/eval_results/Default/Idefics3/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..459293d139ef01abb92c3847e7553a82abf43617 --- 
/dev/null +++ b/static/eval_results/Default/Idefics3/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.09799690552820609, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.0, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.05263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": 
"question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.37186147186147184, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + 
"name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + 
], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 
5.419227899761125e-10, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.043853084084109095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition 
(OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.045454545454545456, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.03571428571428571, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.6359252430381498, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", 
+ "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + 
"input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.09947368421052631, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.11904761904761907, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.022894736842105266, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + 
}, + { + "name": "mvsa_sentiment_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.10714285714285712, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.017857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.005857142857142854, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.21428571428571427, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.23809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, 
+ { + "name": "vln_tegulu_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": 
"healthcare_info_judgement", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", 
+ "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.018367346938775512, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + 
"app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.06933814569716257, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.36377551020408166, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + 
"Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface 
Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.017857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.15151515151515152, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.18627450980392157, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and 
Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and 
Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface 
Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.0016611295681063123, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.14285714285714285, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.3937189896097942, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.020833333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and 
Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", 
+ "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", 
+ "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.07058823529411766, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.01020408163265306, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "shape_composition_colours", + "score": 0.05257936507936508, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images 
and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.07142857142857142, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"autorater_semantics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and 
Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", 
+ "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.10800000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.5793103448275864, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": 
"Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.625, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "generated_video_artifacts", + "score": 0.018750000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.20526315789473684, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.3571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + 
}, + { + "name": "defeasible_reasoning", + "score": 0.5931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.603448275862069, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.6827586206896553, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 
0.3444444444444444, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "docci_image_description_long", + "score": 0.4357142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.26842105263157895, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.26666666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_qa", + "score": 0.5714285714285714, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.2172413793103448, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.12142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.23793103448275857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.1866666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 
0.5379310344827587, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.47419354838709676, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.38965517241379305, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.22666666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + 
"score": 0.14666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5192307692307693, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.07777777777777778, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.21052631578947367, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + 
], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.42068965517241386, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7631578947368421, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.45862068965517244, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.3466666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and 
Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and 
Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.6500000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8800000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6800000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.836842105263158, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety 
Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.265, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": 
"multi_lingual_manual_explanation_scooter_French", + "score": 0.2571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.13571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_2B/summary_results.json b/static/eval_results/Default/InternVL2_2B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0243043e9c2981fd3df05c1a2f24eb91964c05ea --- /dev/null +++ b/static/eval_results/Default/InternVL2_2B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.13141974398938763, + "micro_mean_score": 0.13063500716262516 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.23864417043743646, + 
"micro_mean_score": 0.24901117798796224 + }, + "overall_score": 0.14522090778963154 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.14491178903291552 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.12126906675624163 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.16912754929321935 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.18542274192083463 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.13923308734553164 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.23992252224543772 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.3420927318295739 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.14807577209152425 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.13036555933925006 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.01727799227799228 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.057021136657850864 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.10504085961245285 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.1625198552182714 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + 
"average_score": 0.18999779001767986 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.1487677475708977 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.2011727338536935 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.11886936592818943 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.1131404778887607 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.05739750616837997 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.15465451663650032 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.16044698450090833 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.21429521387724249 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.2128614316540013 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.03658352229780801 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.05757839721254354 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.15225683687839608 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.18999779001767986 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17677460549936644 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.158165588340436 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 
0.08722661966805 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.04102853815875594 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.11264043251709285 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.17001758160301803 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3332891958712894 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.1686125516807394 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.21169137106199268 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.10975764217070672 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_2B/task_results.json b/static/eval_results/Default/InternVL2_2B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6c19e281eb17189df0b11341391798694f472783 --- /dev/null +++ b/static/eval_results/Default/InternVL2_2B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "functionality_matching_in_different_objects", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "location_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6370339174257883, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": 
[ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.04047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.2936507936507936, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": 
"User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signage_navigation", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": 
"1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.26666666666666666, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "extract_webpage_headline", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_visual_pref", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_parasite_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and 
Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + 
"output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.010714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "cultural_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.9761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "photoshop_operation", + "score": 0.02976190476190476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene 
and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.0707070707070707, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ishihara_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "Ad_count_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + 
"input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_eval_factual_pref", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_comparison", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + 
"name": "song_title_identification_from_lyrics", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "transit_map_intersection_points", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_sentiment", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and 
Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.20096524579696584, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "vln_english_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": 
"planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "arxiv_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.006060606060606061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + 
}, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "web_action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.18421052631578946, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_homepage_profile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" 
+ ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_camera_motion_description", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"google_streetview_line_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "code_add_tag", + "score": 0.06666666666666667, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.4296473684210526, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "physical_property_reasoning", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and 
Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": 
"contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "funsd_document_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.24444444444444446, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "dvqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "movie_info_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3227898751277714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding 
and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.2597402597402597, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "stock_price_future_prediction", + "score": 0.26914285714285713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + 
}, + { + "name": "graph_shortest_path_planar", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_chordless_cycle", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.057101180112838316, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.2217857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8666499999999999, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.12857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "movie_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.0008403361344537821, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TV_show_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.34033613445378147, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"math_parity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial 
and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.48157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"kvqa_knowledge_aware_qa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5879999999999997, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "humor_understand_caption_match", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "figureqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "stock_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.0, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.2861111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.513578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and 
Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"reward_models_T2I_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 
0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and 
Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 
0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + 
"score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_shapes", + "score": 0.10076530612244897, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.08602607709750568, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.041666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.11904761904761904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": 
"visual_prediction_rater_openable_part_segmentation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.17777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.23103448275862065, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.25000000000000006, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", 
+ "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.11379310344827588, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.03793103448275863, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.2214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.13333333333333336, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.24000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ 
+ "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.3551724137931033, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.3368421052631579, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "generated_video_artifacts", + "score": 0.16874999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and 
Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.29333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "table2latex_complex", + "score": 0.2777777777777778, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.15333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.18620689655172415, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.2551724137931034, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.3379310344827585, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.5687500000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "video2notes", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.18275862068965518, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.11428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + 
"app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.29333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.3578947368421052, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.40714285714285703, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.08965517241379312, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"traffic_accident_analysis", + "score": 0.24999999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.24285714285714288, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.25, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.45, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.4923076923076924, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.45000000000000007, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.032, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.3870967741935483, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.2333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.2517241379310345, + "eval_type": "llm", + "num_demo": 
1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.18421052631578946, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.09473684210526315, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "docci_image_description_long", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.35714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + 
"Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.05714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.1, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User 
Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.11428571428571431, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and 
Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.12857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": 
"open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.7157894736842105, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.6421052631578948, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7150000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.565, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.29, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety 
Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.668421052631579, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.5699999999999998, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_5_2B/summary_results.json b/static/eval_results/Default/InternVL2_5_2B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f8718f5d302518ea84ef84b781f4f5270625aa50 --- /dev/null +++ b/static/eval_results/Default/InternVL2_5_2B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.17806821966478364, + "micro_mean_score": 0.17708809739236367 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.2738430375585404, + "micro_mean_score": 0.2905417024935512 + }, + "overall_score": 0.19039567147289096 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.19614682488147464 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.18910947570579717 + }, + "Language Understanding and 
Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.20543964378430513 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.23636598588530347 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.15691382827270517 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.28604169870255614 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4248446115288219 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.18745928331343714 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.15097551654513372 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.030568378443583684 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.13898447520398388 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.13154711942685113 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.18343540213068474 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.20755556526976354 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.15983467048343838 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.26888883087046195 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.12906517409932386 + } + }, + 
"output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.14702422379343882 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.15324148486802894 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.19977956414542175 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.1665590610582109 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.2529339759528222 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.23420071687554841 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.09651832955404382 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.0784280378818194 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.21260786581183966 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.20755556526976354 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.138285387531761 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.20214332169825855 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.18128339685489062 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.053153113565753 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.12416116984428181 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.22449772657901465 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + 
"average_score": 0.3762336977650326 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.19222024833691936 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.25056132494721467 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.15596334442569906 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_5_2B/task_results.json b/static/eval_results/Default/InternVL2_5_2B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..34ebcde570112e313b86906890ff139daf9e7cc4 --- /dev/null +++ b/static/eval_results/Default/InternVL2_5_2B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event 
Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "code_error_line_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.3857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.48, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.04632755935026561, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.08040063592083609, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 
0.38244047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "game_info_parsing", + "score": 0.6688311688311688, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.08458208458208459, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.17777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.05114285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_content_reasoning", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + 
"output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "stock_price_future_prediction", + "score": 0.21028571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "license_plate_recognition", + 
"score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.18817731556471845, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social 
Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "algebra", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.1174011354666419, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_character_order", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "movie_info_parsing", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "science_basic_physics", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.5333333333333333, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.06547619047619047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "animal_pose_estimation", + "score": 0.013860848714248784, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.10210634994992598, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": 
"structured_output", + "num_input": "2-3 images" + }, + { + "name": "landmark_check_two_images", + "score": 0.04444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "av_view_identification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.513578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": 
"face_keypoint_detection", + "score": 0.4592092436351974, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.14047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": 
[ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.29464285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "star_object_interaction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.7091142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "humor_understand_caption_match", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": 
"sta_action_localization_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5842105263157894, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams 
and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.35777368421052635, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + 
{ + "name": "super_clevr", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "figureqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.05999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "graph_isomorphism", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + 
"app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.7226890756302522, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "physical_property_reasoning", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 
0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "image_style_recognition", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.2959183673469387, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.46428571428571425, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"graph_connectivity", + "score": 0.11666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.40476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.20535714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "medical_cell_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": 
"geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "photoshop_operation", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.12121212121212122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "paper_review_rating", + "score": 0.764197764824463, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": 
"highest_discount_game_price_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 
0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_format_QA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_solution_compare", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + 
}, + { + "name": "song_title_identification_from_lyrics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.030303030303030307, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.049999999999999996, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.2792517006802721, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "web_action_grounding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.25, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_prediction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + 
], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "rocks_samples_compare", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "product_ocr_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_action_recognition", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0503968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_add_tag", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + 
"name": "video_camera_motion_description", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "media_homepage_profile", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "monthly_weather_days_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.8157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_eval_factual_pref", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": 
"Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "game_platform_support_identification", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.017857142857142856, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.17857142857142858, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.21428571428571427, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "mensa_iq_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.1615633519949754, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + 
"name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_spatial", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.24841274279293252, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", 
+ "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "ancient_map_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.3095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "booking_web_rating", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "chess_find_legal_moves", + "score": 0.033620994446927885, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": 
"video_grounding_temporal", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific 
Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + 
], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": 
"visual_prediction_rater_surface_normal_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": 
"2-3 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding 
and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ 
+ "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.40476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + 
"app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 
images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_unmask", + "score": 
0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_control", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + 
"app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_colours", + "score": 0.1304421768707483, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.10374149659863945, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and 
Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.41666666666666663, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"table_understanding_complex_question_answering", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.22352941176470592, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "ascii_art_30", + "score": 0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"generated_video_artifacts", + "score": 0.11250000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.2571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.2210526315789474, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "doc_vqa", + "score": 0.6437499999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "video2notes", + "score": 0.05, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and 
Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.08888888888888888, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "video_qa", + "score": 0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "table2latex_complex", + "score": 0.3222222222222222, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.3620689655172413, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language 
Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.14285714285714288, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.39285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.4214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", 
+ "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.2285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.21333333333333337, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.26896551724137924, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6346153846153848, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.17241379310344832, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.07931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.4285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.21333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], 
+ "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "docci_image_description_long", + "score": 0.6285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.10714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.07600000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.41379310344827597, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.22, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" 
+ ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "nextqa_oe", + "score": 0.24736842105263163, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_humor_understanding", + "score": 0.38965517241379305, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.20344827586206896, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.4526315789473684, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.28, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.3133333333333334, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.48064516129032264, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "science_figure_explanation", + "score": 0.12068965517241381, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.18620689655172415, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.042857142857142864, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.08571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": 
"multi_lingual_manual_explanation_scooter_French", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.44285714285714295, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.6631578947368422, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_captcha", + "score": 0.11052631578947371, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.51, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text 
Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.73, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.7157894736842106, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.615, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.7105263157894738, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + 
}, + { + "name": "bridge_strategies_advanced", + "score": 0.11428571428571431, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.13571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_5_78B/summary_results.json b/static/eval_results/Default/InternVL2_5_78B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..055e16e870658b5ad33e53b229171e8ec80d837e --- /dev/null +++ b/static/eval_results/Default/InternVL2_5_78B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.44132952988532753, + "micro_mean_score": 0.4397079059379812 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.5538024772749066, + "micro_mean_score": 0.5776870163370592 + }, + "overall_score": 0.4558062458859664 + }, + 
"keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.46893853078050696 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.5220829627238773 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.4933134095077618 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.477971701185214 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.3936387335462224 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5610278744213835 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6072907268170428 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.44533550848682696 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.3548055654857457 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.22852234519925363 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.4910486370158392 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.39410061025954557 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.43424133240430957 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5300255483670417 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 
541, + "tasks": [], + "average_score": 0.4793195260560365 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.4622918421665308 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.3729954065847296 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.4226567593431527 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.4149806887502539 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.4904285184890861 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4348674018783908 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5124942746906233 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.4717682857925982 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.20496909081092754 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.4184724897299287 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4951997132559491 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5300255483670417 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.286105084660728 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.39635000103107665 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5401547630322637 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 
0.26403470419652064 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.3933356676003734 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5168098196770042 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.47731479110938463 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.4388571290145052 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5034762755043025 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.37742798395328586 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_5_78B/task_results.json b/static/eval_results/Default/InternVL2_5_78B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3604ee9bfb5e3f354756eec202184714946b91da --- /dev/null +++ b/static/eval_results/Default/InternVL2_5_78B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.6517117230612683, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific 
Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.012684730303418308, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + 
"num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + 
"score": 0.5833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.021928571428571488, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6530612244897959, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": 
"math_breakpoint", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.6583616780045353, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.74, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + 
"app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.5694444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.45210923844984136, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"3d_indoor_scene_text_bbox_prediction", + "score": 0.11345367411269795, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.7291575502542685, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.9155844155844154, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + 
{ + "name": "graph_hamiltonian_cycle", + "score": 0.14264455782312926, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.24406600635762046, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6906666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.43826443800341836, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + 
"output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.4666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.21111111111111108, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", 
+ "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.96218487394958, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.6914210526315789, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.8253968253968257, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.7769182644835021, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.6888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_output_result", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "figureqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6821578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.7976428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.6642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.7767857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9206349206349208, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.35714285714285715, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.7619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9791142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + 
{ + "name": "electricity_load_estimate_plot", + "score": 0.6518571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.2189325582280229, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.8375210526315788, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.19583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6160714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.8095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": 
"numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 1.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.21428571428571425, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.20000000000000004, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.08571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + 
], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.3095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.3392857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + 
"output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.2857142857142857, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.04761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.8888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + 
}, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + 
"name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.2941859731424949, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 
0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9107142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.437942009022375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.42857142857142855, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.2653061224489796, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_cell_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.6312074829931973, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.23333333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"media_homepage_profile", + "score": 0.21370829033367733, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "google_streetview_direction_understanding", + "score": 
0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition 
(OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language 
Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", 
+ "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.2119047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": 
"4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.32323232323232326, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.4323529411764706, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.6045345424369317, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], 
+ "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + 
"output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 
0.48214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_author", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based 
Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.014937888198757762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + 
"name": "functionality_matching_in_different_objects", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"pokemon_3D_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6423672507591217, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific 
Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.8823529411764706, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + 
"name": "product_ocr_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.2456140350877193, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.41666666666666663, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.4428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.1515151515151515, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.2608695652173913, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language 
Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + 
}, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.8095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.5882352941176471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + 
"input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + 
"output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.32857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6470588235294119, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene 
and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.3770408163265306, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.5488378684807256, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + 
], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.6333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"ocr_resume_school_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.6428571428571429, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + 
"skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + 
"name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.8235294117647058, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", 
+ "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.386, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and 
Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.9142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.710344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.7125, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "generated_video_artifacts", + "score": 0.19999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and 
Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.23571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.3999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.5785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.7241379310344827, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.8172413793103448, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.7571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.7344827586206897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.6222222222222222, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "docci_image_description_long", + "score": 0.7785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + 
"app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.4052631578947368, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.5785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.43333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_qa", + "score": 0.7928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.6448275862068965, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.6214285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.5827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.26666666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.7689655172413794, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.45806451612903226, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.7862068965517242, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.6866666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.3266666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.6714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" 
+ }, + { + "name": "GUI_Chat_Easy", + "score": 0.7538461538461539, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.6666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.4947368421052632, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.3928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.7724137931034483, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.836842105263158, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + 
"skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.5655172413793103, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.52, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.32857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.28571428571428575, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.24999999999999997, + "eval_type": "llm", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.55, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.4857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.6571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.5428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.7750000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7400000000000003, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6949999999999998, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8157894736842107, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8450000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.19285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.2571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical 
and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_76B/summary_results.json b/static/eval_results/Default/InternVL2_76B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2c244cef39cdaadb040968fc9007b1a1307168c5 --- /dev/null +++ b/static/eval_results/Default/InternVL2_76B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.3562710424410931, + "micro_mean_score": 0.35129859801162616 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.5192997443033639, + "micro_mean_score": 0.5421324161650903 + }, + "overall_score": 0.3772549347599992 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.38193012983650343 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.41315219763443384 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.43665980552577693 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.4265623936500962 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 
1910, + "tasks": [], + "average_score": 0.2975890791763991 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5257990949897898 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5779473684210527 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.33287081421166276 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2949505390920417 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.17036496432397477 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.3634339625985008 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.31396468806559114 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3473756113126343 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.395893002855977 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.44982107744035305 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.42875248733027654 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.2868239162778749 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3630499545707523 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3476691827105281 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": 
[], + "average_score": 0.3943337471922549 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.29244088978470345 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.45822072478616577 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3879326330400817 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.20309901738473166 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.34771123515123364 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4145693044465943 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.395893002855977 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.24403942809507134 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.3153417935059416 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.4306947454508794 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2132321995754061 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.2953329718984368 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.42202934355552685 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.47409276729986083 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.30014798153766264 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.4625649385962016 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + 
"average_score": 0.2868813944130515 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_76B/task_results.json b/static/eval_results/Default/InternVL2_76B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c1438f997ba8a34e91d10a3eca49d3091dd91ca3 --- /dev/null +++ b/static/eval_results/Default/InternVL2_76B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.5937981812316329, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.9333333333333333, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.021818162950542508, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.4166666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.07099999999999997, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5612244897959183, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event 
Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", 
+ "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.6527777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"emotion_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.58, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and 
Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.4633266171344593, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.04091742079331852, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.4056667566073084, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical 
Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.9155844155844154, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.2642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "super_clevr", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5920000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.3876035519415914, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and 
Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.02222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9327731092436977, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5457894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7936507936507937, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.42222222222222233, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": 
"1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.614578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.8522857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": 
"hotel_booking_confirmation_parsing", + "score": 0.5928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9007936507936509, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + 
"num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.23809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9578785714285712, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.46228571428571424, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + 
"app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.5137842105263157, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.369047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6339285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"youtube_video_info_parsing", + "score": 0.8214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.35555555555555557, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + 
"Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"geographic_remote_sensing_land_cover", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.2777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"multiple_states_identify_europe", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.3555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 
images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.3095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.3095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07551020408163266, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": 
"9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.26066139706079494, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.7809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.14285714285714285, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.4093154979121689, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.12244897959183673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"booking_web_recommendation", + "score": 0.48435374149659866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.22976190476190478, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.11518481518481517, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": 
"media_recommend_solutions_stackoverflow", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": 
"multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.1717171717171717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + 
"score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.47892156862745094, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.2857142857142857, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5792494908838608, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + 
"output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.38690476190476186, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.027035301548183985, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams 
and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.4166666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6370339174257883, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.6428571428571429, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.10416666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.7647058823529411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": 
"1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.20606060606060606, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.21739130434782608, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": 
"constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.7857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": 
"app_interactive_operations_alipay", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.14285714285714285, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + 
"app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.5764705882352942, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + 
"app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.3096938775510204, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.37069160997732437, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", 
+ "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + 
"output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", 
+ "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and 
Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", 
+ "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, 
+ { + "name": "app_layout_understanding_notes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.15999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.38571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8928571428571431, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", 
+ "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.6857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.7758620689655171, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.6749999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "generated_video_artifacts", + "score": 0.25625000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.24285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + 
"score": 0.31052631578947365, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.42857142857142855, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.789655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.7724137931034479, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.8285714285714286, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.793103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.6222222222222222, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "docci_image_description_long", + "score": 0.7285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.20526315789473687, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.4928571428571428, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.36, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_qa", + "score": 0.7928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.5586206896551723, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.5642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.4034482758620689, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, 
+ "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.21333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.7689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.36451612903225805, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.7551724137931034, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.5714285714285714, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.6266666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.18000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6423076923076925, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.711111111111111, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and 
Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.41578947368421054, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.31428571428571433, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.7724137931034484, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8421052631578947, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.6000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": 
"Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.45999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.2571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.27142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.5071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + 
"input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.46428571428571425, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.3785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.35714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.5214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.42142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": 
"Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.42857142857142866, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.6850000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8400000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + 
"skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.7200000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.6800000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.2571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.29285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": 
"Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_8B/summary_results.json b/static/eval_results/Default/InternVL2_8B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6d91119c9457eff407742939ac2882586060f469 --- /dev/null +++ b/static/eval_results/Default/InternVL2_8B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.25956581776451815, + "micro_mean_score": 0.2546984460483302 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1165, + "macro_mean_score": 0.3978571701460552, + "micro_mean_score": 0.4108583690987125 + }, + "overall_score": 0.2773656948037259 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2817247716997634 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.280559214034858 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2511, + "tasks": [], + "average_score": 0.32020728060179815 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2469, + "tasks": [], + "average_score": 0.325593535916075 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.24118253695139918 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.39684007367798446 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4700852130325815 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.27052668526005397 + }, + "Spatial and Temporal 
Reasoning": { + "count": 152, + "num_samples": 2439, + "tasks": [], + "average_score": 0.23189345356483618 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.08260405712900723 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.22800928556370195 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2013779290163996 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.2804429603269583 + }, + "Videos": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.34791358240562653 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.2942163420306113 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.3388056726588417 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.10933317885944857 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.250804626773504 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.2522493284864019 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.27414636444623874 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.22381302045502052 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1456, + "tasks": [], + "average_score": 0.3537549824897016 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.30261189962428353 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": 
[], + "average_score": 0.15434618291761149 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.19872104324302098 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.30088711082969344 + }, + "video": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.34791358240562653 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17725087609332119 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2532272454839157 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.29129840423784176 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.12166926715781588 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.24700310231619527 + }, + "Perception": { + "count": 145, + "num_samples": 2315, + "tasks": [], + "average_score": 0.3214666523378005 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3995660275981844 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.24614711281861912 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3393895915929317 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.22078333222564453 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/InternVL2_8B/task_results.json b/static/eval_results/Default/InternVL2_8B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a948f5adb69fc399f963cb90ab13f6eac750b8c4 --- /dev/null +++ b/static/eval_results/Default/InternVL2_8B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "functionality_matching_in_different_objects", + "score": 
0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "paper_review_rating", + "score": 0.7270348376779725, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.20595238095238097, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.28279478458049884, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signage_navigation", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.6019887092978411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"video_grounding_temporal", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "extract_webpage_headline", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "medical_parasite_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "CLEVRER_physics", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.035313785427807685, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": 
"Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_execution", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 
0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "music_sheet_author", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.8821428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and 
Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "photoshop_operation", + "score": 0.19642857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + 
"output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.29362745098039217, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ 
+ "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": 
"1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.1414141414141414, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ishihara_test", + "score": 0.34285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 
0.11666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "number_comparison", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and 
Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + 
"num_input": "4-5 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.08695652173913043, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.007711038961038961, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_sentiment", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "video_intent_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision 
Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.3204993605915246, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface 
Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "vln_english_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + 
"score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.0606060606060606, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "position_relationship", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + 
"score": 0.13157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_homepage_profile", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_camera_motion_description", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene 
and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.38205263157894737, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.043576951479015184, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 
0.6865079365079365, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.04444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "physical_property_reasoning", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and 
Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.19047619047619047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", 
+ "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.3392857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.08888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "dvqa", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + 
"name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.44950911131594296, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.0320582378164677, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.7272727272727273, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + 
"num_input": "video" + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.21666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_descriptive", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.46903875551438506, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.32653061224489793, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, 
+ { + "name": "hotel_booking_confirmation_parsing", + "score": 0.39999999999999997, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.54, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "movie_info_parsing", + "score": 0.41964285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.027174771375503042, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TV_show_info_parsing", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.7394957983193277, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and 
Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "famous_building_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_isomorphism", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": 
"1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.35555555555555546, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and 
Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "3d_fragments_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.44052631578947365, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.21428571428571427, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "science_basic_physics", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5533333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "humor_understand_caption_match", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": 
"Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.31442052224278927, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.3928571428571428, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.5184240362811792, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.5382631578947369, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and 
Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { 
+ "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], 
+ "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.40476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { 
+ "name": "constrained_generation_contain_contain_images", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.3058823529411766, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language 
Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User 
Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + 
"num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + 
"name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_instagram", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"MFC_Bench_check_out_of_context", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": 
"Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_artifact", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_shapes", + "score": 0.11760204081632653, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.15759637188208617, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": 
"9-image or more" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.22916666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.02222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial 
Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", 
+ "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.018357142857142832, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.36904761904761907, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "stock_price_future_prediction", + "score": 
0.6153571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5148571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9305714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_cell_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + 
"name": "medical_blood_vessels_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "action_prediction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.5357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.33103448275862074, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.40666666666666657, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.4620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.39310344827586213, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": 
"Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.3448275862068966, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.28421052631578947, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.45333333333333325, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video2notes", + "score": 0.39999999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "activitynetqa", + "score": 0.431578947368421, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": 
"figurative_speech_explanation", + "score": 0.4034482758620689, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.55, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.6928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.21724137931034476, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.403448275862069, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"image_captioning_with_additional_requirements", + "score": 0.8285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.3333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "defeasible_reasoning", + "score": 0.4793103448275863, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.7875, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "funny_image_title", + "score": 0.5857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6653846153846155, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.1896551724137931, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"funqa_unexpected_action_creative_video", + "score": 0.17333333333333334, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_summary", + "score": 0.37142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.613793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.20000000000000007, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.5, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ 
+ "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.6032258064516127, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.2357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.40714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.31250000000000006, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "table2latex_complex", + "score": 0.6222222222222222, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.5071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.26842105263157895, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_content_follow_up", + "score": 0.4714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8549999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + 
"app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8684210526315791, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.66, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8421052631578949, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6199999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.6100000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.3857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.4428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social 
Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.14285714285714288, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.15714285714285717, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.12142857142857143, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.1142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.45000000000000007, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Llama_3_2_11B/summary_results.json b/static/eval_results/Default/Llama_3_2_11B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b9e128e5c619e8d90b92df12a38760d4d8f440b2 --- /dev/null +++ b/static/eval_results/Default/Llama_3_2_11B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.15999641916771298, + "micro_mean_score": 0.15809331016967038 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.3173342406187366, + "micro_mean_score": 0.3487962166809973 + }, + "overall_score": 0.1802478219287358 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.1907604552173455 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.14328677752263275 + }, + "Language 
Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.19646404502647707 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.22399113135844315 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.13303760019716085 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.323153603297999 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4260501253132832 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.1770852858056774 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.15366454315378308 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.06563884729522687 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.11886347847341794 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.11489351406848371 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.1693681214060816 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2123769209846321 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.2520175802062012 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.2485354956932213 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.06418655520777307 + } + }, 
+ "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.12417283740525839 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.16374180545556977 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.1576236804437753 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.15014439824913947 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.3003142292328822 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.19270157739425633 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.1463246409674981 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.0732004839476103 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.1960107191983825 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2123769209846321 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.1351857051327849 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.18586695387250338 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.17288724679416761 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.08100042975820579 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.0575426944971537 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.19899465185565898 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + 
"average_score": 0.254316961351997 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.162801811963855 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.28055776664538923 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.13937853323074623 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Llama_3_2_11B/task_results.json b/static/eval_results/Default/Llama_3_2_11B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..54bb871b50f68090ff56c48c2980c262ea44e0e0 --- /dev/null +++ b/static/eval_results/Default/Llama_3_2_11B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.20517362180506796, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.0320582378164677, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.21666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ 
+ "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.18367346938775506, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": 
"perception_test_video_character_order", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.23051948051948054, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.46, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.13514770116170818, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.009133339778501075, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.3577885451105633, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": 
"Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.7012987012987012, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.056101443718362946, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + 
"score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.05999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.056296198118786375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, 
+ { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], 
+ "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or 
more" + }, + { + "name": "game_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.08888888888888889, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + 
"num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.8025210084033614, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + 
"score": 0.04578947368421055, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.2301587301587302, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.3504119352087328, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.4666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.015315789473684218, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.039, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.05952380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.5571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + 
"score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.2767857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.3968253968253968, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + 
"input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.38095238095238093, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.6037857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.10771428571428569, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.09968483232794643, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.1965947368421053, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.48214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.11607142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.4047619047619047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.017857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { 
+ "name": "multiple_states_identify_europe", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.04444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.020833333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + 
"score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.06598639455782314, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": 
"Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.21429807404733187, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { 
+ "name": "vln_english_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.13949694490239214, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.7452380952380953, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.04695531121001506, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 
0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.42823129251700676, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.15357142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + 
}, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.33333333333333337, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.0, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.030303030303030304, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical 
Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.1691176470588235, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + 
"score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.4372276395262732, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.06547619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.25, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.028492188570677254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", 
+ "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" 
+ ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.5778868460875585, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.18571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.16969696969696965, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + 
], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": 
"app_interactive_operations_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": 
"User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.13571428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6823529411764707, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + 
"app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.14540816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.09268707482993196, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"ocr_resume_school_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", 
+ "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" 
+ }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.154, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.07857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.09285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.17857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.6241379310344828, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.7625000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "generated_video_artifacts", + "score": 0.3625, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.32631578947368417, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.25, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"defeasible_reasoning", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.3714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.49310344827586217, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.5896551724137931, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.17777777777777776, + "eval_type": "llm", + "num_demo": 1, 
+ "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "docci_image_description_long", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.12105263157894736, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.2533333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_qa", + "score": 0.3357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.3931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.34482758620689646, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.25999999999999995, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.4482758620689656, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and 
Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.3709677419354839, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.18620689655172415, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.5066666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.17333333333333334, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.10714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.39230769230769236, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.07777777777777778, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.22631578947368425, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.5142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.3620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.5894736842105264, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.23448275862068965, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.13999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.44285714285714295, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.47142857142857153, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.2285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.20714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.1857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.8526315789473685, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.7, + 
"eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.9, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8631578947368422, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.675, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.7578947368421054, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8150000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.24285714285714283, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.25, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.19285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.1714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.24285714285714288, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Mammoth_VL/summary_results.json b/static/eval_results/Default/Mammoth_VL/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6b2cc0baf5cb8d8e9cfd4184289f35fba2e6c779 --- /dev/null +++ b/static/eval_results/Default/Mammoth_VL/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.264052880412689, + "micro_mean_score": 0.2626894374387823 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.37992668750165337, + "micro_mean_score": 0.40120378331900275 + }, + "overall_score": 0.27896733083008046 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": 
{ + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.30194776127683565 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.2365295791606494 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.2993927028494267 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.3366347826116991 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2408454736444444 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.37895522991264047 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.48003508771929826 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.27232427744946475 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.24522937191710698 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.11457024299726488 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.18941525254390731 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.1718334741390191 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.28108187023954245 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3391119999611432 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.36434285930327387 + }, + 
"Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.36915384448504296 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.15940750469262005 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.2456942956200745 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.21586513216389874 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.29359048024032264 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.2646677074112521 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.34733130661096645 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3286125236284589 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.16358654572940287 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.25463059203015115 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2919119209789575 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3391119999611432 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.20016011839130254 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2679179451692527 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.23600902063965679 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.15326915093278803 + }, + "Coding": { + "count": 31, + "num_samples": 
474, + "tasks": [], + "average_score": 0.20668466311255687 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.33348955971237954 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3759170425350556 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.23894961766260706 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.351703435685048 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.26074348700688493 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Mammoth_VL/task_results.json b/static/eval_results/Default/Mammoth_VL/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f6bb4d6e41528d198bb96b60aec60cc1550013fd --- /dev/null +++ b/static/eval_results/Default/Mammoth_VL/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.310472219974674, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.030820962942379463, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.2857142857142857, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.36666666666666664, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social 
Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.12244897959183672, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 
0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.3629715522572665, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": 
"Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.56, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.041666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.34586160183944936, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.11160580091854058, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5818241712483578, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.2792207792207792, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.3392857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.21513761521109695, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5893333333333335, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.7907526968967371, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + 
"output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.33333333333333326, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + 
"Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.08888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical 
Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.06722689075630253, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5094736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.253968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5344949749908332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_output_result", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": 
"exact_text", + "num_input": "4-5 images" + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.5191578947368422, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.3442857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + 
"score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": 
[ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.15178571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.4087301587301588, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"clevrer_video_moving_object_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.5852857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.35735714285714293, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.17949711661260873, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.4398578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.40476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.16071428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"multiple_states_identify_asia", + "score": 0.1142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.1571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.1285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.15555555555555553, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": 
"visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.12499999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 
images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.2, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 
0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.1064625850340136, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 
0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.16417078019580122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0020726462511010965, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": 
"1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.22423936011541246, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.061224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_cell_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"geometry_reasoning_count_line_intersections", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.3815192743764172, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.005952380952380952, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ 
+ "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", 
+ "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 
0.42857142857142866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": 
[ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.18181818181818182, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", 
+ "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.3245098039215687, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5792190745087759, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.14285714285714285, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0503968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + 
], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": 
"Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_author", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.029659344301921898, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.11666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + 
"app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.7013765649868268, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.8823529411764706, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0005013842565343441, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": 
"Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.3857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" 
+ }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.08695652173913043, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": 
"app_interactive_operations_alipay", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.07142857142857142, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning 
and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6705882352941178, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], 
+ "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5952380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, 
+ { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.2636904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.23069727891156463, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": 
"autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, 
+ "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.08800000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.27142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6285714285714284, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.7500000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.4214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.4862068965517241, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.51875, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "generated_video_artifacts", + "score": 0.09375000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.19285714285714284, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + 
"Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.33157894736842103, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.45, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.593103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.40714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.6724137931034483, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": 
"Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.5642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.6965517241379311, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.8111111111111112, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "docci_image_description_long", + "score": 0.6714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.34736842105263155, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.17142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.29999999999999993, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_qa", + "score": 0.3285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.40689655172413797, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.3928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + 
"input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.2344827586206896, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.22000000000000006, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.506896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.3838709677419356, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.33103448275862074, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.45999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.26666666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5307692307692307, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", 
+ "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.3777777777777777, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.5421052631578946, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.25000000000000006, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.4724137931034483, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.3241379310344827, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.4333333333333334, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.45000000000000007, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.014285714285714287, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.32857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.9157894736842105, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.615, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.16315789473684214, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7700000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8736842105263158, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.675, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.9105263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.4, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 
images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.07857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.12142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.2571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/MiniCPM_v2.6/summary_results.json b/static/eval_results/Default/MiniCPM_v2.6/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..636b1496744d89284ea5089d88cce3d34abddac2 --- /dev/null +++ b/static/eval_results/Default/MiniCPM_v2.6/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.22955895202146906, + "micro_mean_score": 0.22560399396899078 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.41728623355613875, + "micro_mean_score": 0.43452278589853827 + }, + "overall_score": 0.2537218694467236 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2604967101191775 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.2500331562865158 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.3003169369011028 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.31808748114668184 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.18281637763548025 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.40732197204308807 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.48798245614035085 + }, + 
"Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.23723675736151562 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.1968926733821904 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.08735883237069725 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.21195711598986072 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.18639148159043903 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.21578309681746147 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3527537836840162 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3096882575625531 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.3176880312524649 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.0755920550038197 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.23506388020592064 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.1781127776443048 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.2551275278138797 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.20833171754655547 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.36473950920880716 + }, + "multiple_choice": { + "count": 85, + 
"num_samples": 1363, + "tasks": [], + "average_score": 0.293386806641223 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.13955971277399848 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.23596215721092323 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.26319603880798287 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3527537836840162 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17888270664238365 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.22288558250834017 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.2666989364424082 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.11693267119342445 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.15342045420318667 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.29243044121840894 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3777897246686755 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.25714862989687987 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.33187729423141027 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.16493399805627715 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/MiniCPM_v2.6/task_results.json b/static/eval_results/Default/MiniCPM_v2.6/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..aea366727ae846ca60d8703baaa0c037022a920f --- 
/dev/null +++ b/static/eval_results/Default/MiniCPM_v2.6/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "functionality_matching_in_different_objects", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "location_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6265087790877971, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.1619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.17740929705215416, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signage_navigation", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and 
Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based 
Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5697307134254179, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_visual_pref", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_parasite_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.025571294374413373, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_execution", 
+ "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "music_sheet_author", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + 
"app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "comic_page_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "photoshop_operation", + "score": 0.15476190476190474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.6428571428571429, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.17401960784313725, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.13131313131313133, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ishihara_test", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "Ad_count_detection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge 
and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_eval_factual_pref", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_comparison", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": 
"1-image" + }, + { + "name": "paper_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + 
"app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.12570039901308083, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "transit_map_intersection_points", + "score": 0.017857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + 
"score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_sentiment", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.08070309345387555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "vln_english_next_step", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" 
+ }, + { + "name": "code_match_problem", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": 
"interpret_force_perspective_illusion", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.08484848484848483, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" 
+ ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "web_action_prediction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition 
(OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "position_relationship", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.23684210526315788, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + 
"output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_homepage_profile", + "score": 0.039560439560439566, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_camera_motion_description", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge 
and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial 
and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.3116385321052631, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.25003536329619036, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.2420634920634921, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + 
"num_input": "6-8 images" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.6509561973369343, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" 
+ ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.38095238095238093, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.431547619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "funsd_document_qa", + "score": 
0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.04821164701888182, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" 
+ ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.3482142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.17777777777777776, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "dvqa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", 
+ "num_input": "1-image" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 
0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.2205833035621225, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.034222739980969856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.753246753246753, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "stock_price_future_prediction", + "score": 0.4511428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "map_diagram_qa", + "score": 
0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { 
+ "name": "image_style_recognition", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.4546989887491169, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.4178571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.4404761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social 
Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "quizlet_question_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8529071428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.09183673469387754, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.3214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.24, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "movie_info_parsing", + "score": 0.41964285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TV_show_info_parsing", + "score": 0.4603174603174602, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.7184873949579832, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", 
+ "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_isomorphism", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.24444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "3d_fragments_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.3010526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.21324372091628846, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 
0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.19999999999999998, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language 
Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.05599999999999996, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "humor_understand_caption_match", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "figureqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.3512772395965795, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.23809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.42993197278911566, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.2630526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision 
Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", 
+ "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and 
Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"MMSoc_Memotion", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based 
Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.14285714285714285, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition 
(OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 
0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_artifact", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + 
"app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_shapes", + "score": 0.21284013605442173, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.10541383219954649, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + 
{ + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.14285714285714285, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams 
and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + 
"output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + 
"num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, 
+ "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.4444444444444444, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.45, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.24999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_summary", + "score": 0.5000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.48000000000000004, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language 
Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.5066666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "activitynetqa", + "score": 0.38947368421052636, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.07, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.1, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funny_image_title", + "score": 0.4499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" 
+ ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.5444444444444445, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "video_detail_description", + "score": 0.4473684210526316, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.2533333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7842105263157896, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.12142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_humor_understanding", + "score": 0.5241379310344827, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.5785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.31578947368421056, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.5344827586206897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.396551724137931, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.6137931034482759, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.6142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.4357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.17142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.7187499999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "docci_image_description_long", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.4799999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.3857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.48387096774193555, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + 
"skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.21379310344827582, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.5172413793103448, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.4714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.4068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.3586206896551724, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.2533333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.6344827586206898, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.5241379310344827, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.7928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" 
+ }, + { + "name": "visualization_with_code", + "score": 0.15714285714285717, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.39285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.43499999999999994, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.64, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.68, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.7600000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8947368421052632, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.3214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.1, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.09285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.21428571428571425, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.37142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.26428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/NVLM/summary_results.json b/static/eval_results/Default/NVLM/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1f5960546368b4fa15bc67524b10eebb5393c2ca --- /dev/null +++ b/static/eval_results/Default/NVLM/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + 
"macro_mean_score": 0.21589726765847422, + "micro_mean_score": 0.21406043849932396 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.3478114310231307, + "micro_mean_score": 0.3947549441100602 + }, + "overall_score": 0.23287631838857856 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.21591473223174515 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.27426258729618225 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.284874072963892 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.2134087963800149 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2525993645909815 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.4029543142569604 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4317142857142857 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.2442484196551863 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.1424318574406695 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.046798309600525674 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.19655048708297065 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.18621338396242557 + }, + "Diagrams and Data 
Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.2922667531642391 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.0 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3447361496776569 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.29674507895195534 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.09716389574493003 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.19684666506287793 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.2199792859352912 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.25164831125437204 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.2396831363622878 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.3215948035793096 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.1853526865291571 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.0 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.0 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3352056263801705 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.0 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.038244047619047615 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2100484481849172 + } + }, 
+ "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.15704252277801936 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.06688589450465973 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.2292747206409446 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.2689383226748064 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.18857142857142856 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.23682040748983965 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3656649917873737 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.26866914106442213 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/NVLM/task_results.json b/static/eval_results/Default/NVLM/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..38d4ee6da1a106eb733002cbbba4381cf1926f9b --- /dev/null +++ b/static/eval_results/Default/NVLM/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.03539274548424487, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": 
"science_molecule_chemistry", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and 
Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + 
"num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.2755102040816326, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": 
"vizwiz_quality_accessment_for_blind", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.68, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.39920016568805217, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"3d_indoor_scene_text_bbox_prediction", + "score": 0.0688028531942305, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.1841206696719848, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.7012987012987013, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" 
+ }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.13760548426771885, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5240000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.37558289757353336, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.35555555555555546, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9075630252100841, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.43610526315789483, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + 
"Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7460317460317459, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.4075492509529021, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.6222222222222221, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "figureqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.541578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.6185714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.21428571428571427, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.8571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"clevrer_video_moving_object_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8056142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.343, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.013955337071466666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.3731789473684211, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" 
+ }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": 
"User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"cvbench_adapted_cvbench_distance", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.11428571428571428, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.12857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language 
Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface 
Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.1820853399616799, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + 
"input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, 
+ { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.0, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + 
"app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.009223028391365047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": 
"weather_map_climate_type_temperature_parsing", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.1836734693877551, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.2104138975306737, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object 
Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.061224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": 
"Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal 
Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + 
"num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"logical_reasoning_2D_views_of_3D_shapes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.21212121212121213, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.29705882352941176, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" 
+ }, + { + "name": "top_video_creator_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.3997587534660852, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.42857142857142855, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.13690476190476192, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": 
"monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_author", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.007137000066986845, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.17543859649122806, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.08333333333333333, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.21428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical 
and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.08695652173913043, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.5, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", 
+ "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", 
+ "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.27142857142857146, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and 
Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6705882352941177, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5952380952380951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.1364795918367347, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.1446995464852608, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": 
"autorater_aesthetics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": 
"Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + 
"skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + 
"score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.21600000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.24285714285714288, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.7214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.6214285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.7206896551724138, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.38125, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "generated_video_artifacts", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.4928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and 
Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.7000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.6571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.7068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.7482758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "docci_image_description_long", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { 
+ "name": "video_qa", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.5241379310344828, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.2689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 
0.6586206896551723, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.529032258064516, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.6620689655172413, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.6000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.6599999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 
0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7307692307692307, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.4444444444444444, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.23571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", 
+ "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.6517241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.805263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.4689655172413794, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.13571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.36428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.19285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", 
+ "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.45, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.5, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.72, + "eval_type": "llm", + 
"num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.535, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6699999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.3499999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Phi-3.5-vision/summary_results.json b/static/eval_results/Default/Phi-3.5-vision/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1e9d5d25bb8ca28106310878ffdeebc6788d2f0c --- /dev/null +++ b/static/eval_results/Default/Phi-3.5-vision/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.22995297916629392, + "micro_mean_score": 0.22708502951025372 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.3947914647737769, + "micro_mean_score": 0.42459157351676696 + }, + "overall_score": 0.2511698139474551 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 
0.2550326045763433 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.24395249720074527 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.2858236369733704 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.29876274710122536 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.21972896566746963 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.37513466171380355 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4713934837092732 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.25475240046465697 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.20386233377001492 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.06657701969095552 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.16556787388989183 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.17989790940001513 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.2671646581690049 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.24920333780186898 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3057560384411286 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + 
"average_score": 0.3341992361416253 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.12884156381685322 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.20494682188374266 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.21180084406324556 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.2609992615064841 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.2149689274645855 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.365192668303297 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.2593652357274648 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.10107709750566891 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.11861055655587921 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2824151476986241 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.24920333780186898 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.1980440594073205 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2636292373854696 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.20747122167273002 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.08602953103518936 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.20136893467064246 + }, + 
"Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.30979039348232706 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3495072422622861 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.25858403958844717 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3357218088688187 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.21140555087788399 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Phi-3.5-vision/task_results.json b/static/eval_results/Default/Phi-3.5-vision/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..eba9431f34beb2807358932d6431ddcd32db923e --- /dev/null +++ b/static/eval_results/Default/Phi-3.5-vision/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "functionality_matching_in_different_objects", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6556210357219137, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based 
Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 
0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.1738095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.2679705215419501, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and 
Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.10204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.11483433671209539, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + 
"input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 
0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_visual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and 
Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.23809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "CLEVRER_physics", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.029751219517657808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + 
{ + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.0326530612244898, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", 
+ "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "music_sheet_author", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.5773809523809524, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "paper_review_acceptance", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "photoshop_operation", + "score": 0.18452380952380953, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.17401960784313728, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.21428571428571427, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.12121212121212122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ishihara_test", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.03333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "Ad_count_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_eval_factual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_comparison", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + 
"num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal 
Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.12530331014008708, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "transit_map_intersection_points", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_sentiment", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.05222405626377131, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific 
Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "vln_english_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and 
Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + 
"name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.35714285714285715, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and 
Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + 
"app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "web_action_prediction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "position_relationship", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"Bongard_Problem", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_homepage_profile", + "score": 0.12833594976452117, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_camera_motion_description", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0503968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.5203105263157894, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.4903669849303062, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.6468253968253969, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "physical_property_reasoning", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.7891263675852077, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.03700000000000008, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.19047619047619047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.2642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "funsd_document_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.032984275722218924, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": 
"Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.11607142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.3777777777777777, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "dvqa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5357142857142857, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3545876186407767, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and 
Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.034222739980969856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": 
"contextual_formatted_text", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.5194805194805193, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "stock_price_future_prediction", + "score": 0.7397142857142861, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.14285714285714285, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "map_diagram_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.39135519540824326, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5469999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + 
"num_input": "video" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.16666666666666669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.3684210526315789, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9458071428571426, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.2653061224489795, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.1285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.020833333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.52, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + 
], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "movie_info_parsing", + "score": 0.19642857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.018377650164657217, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TV_show_info_parsing", + "score": 0.4523809523809524, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.6092436974789915, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "famous_building_recognition", + "score": 0.40625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.24444444444444446, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.6657894736842106, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.21324372091628846, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + 
"name": "newspaper_page_parse_and_count", + "score": 0.4666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.30357142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.28000000000000014, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "humor_understand_caption_match", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "stock_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.34326122251920943, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.44642857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": 
"coco_ood_global_image_retrieval_by_query_property", + "score": 0.27456709956709957, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.34536842105263166, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": 
"Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.4285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition 
and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.42857142857142855, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + 
{ + "name": "ocr_math_text_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + 
"name": "app_interactive_operations_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": 
"Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + 
"num_input": "4-5 images" + }, + { + "name": "autorater_artifact", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_shapes", + "score": 0.09821428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.16808390022675737, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.10416666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.11904761904761904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + 
"name": "multiple_states_identify_americas", + "score": 0.08571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.6724137931034484, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.510344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.2448275862068965, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.3888888888888889, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.24000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.7857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.4857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.6241379310344828, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.5, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "generated_video_artifacts", + "score": 0.125, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.43333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "table2latex_complex", + "score": 0.3555555555555555, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.18666666666666665, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", 
+ "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.5655172413793104, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.7241379310344829, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.6562499999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "video2notes", + "score": 0.4571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object 
Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.45862068965517233, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.1857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.24666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.25789473684210523, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.45000000000000007, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": 
"Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.3931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.47857142857142865, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.4642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"video_qa", + "score": 0.6214285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6076923076923078, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.052000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.5857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + 
"score": 0.40967741935483876, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.26666666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.6310344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7526315789473683, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "docci_image_description_long", + "score": 0.49285714285714277, + "eval_type": "llm", 
+ "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.14999999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding 
and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.05714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.5785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.3714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision 
Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + 
"num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.8578947368421055, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8894736842105264, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", 
+ "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8950000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.655, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.915, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8631578947368422, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.7900000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Pixtral_12B/summary_results.json b/static/eval_results/Default/Pixtral_12B/summary_results.json 
new file mode 100644 index 0000000000000000000000000000000000000000..d7b2c538d50bf2b1e42d3ba272fa87d54e676a20 --- /dev/null +++ b/static/eval_results/Default/Pixtral_12B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.31362045151669854, + "micro_mean_score": 0.3100986209078182 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.4566234428542061, + "micro_mean_score": 0.4870593293207223 + }, + "overall_score": 0.33202677713439754 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.34184129499032456 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.37667712211439836 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.37896441862738645 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.37077191302051077 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2843861774995234 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.4098150360139686 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.533077694235589 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.3372902862054838 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.25372282838901716 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.09524894246403817 + } + }, + 
"input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.2972619996610934 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.28304049684103855 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.33523333364720703 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3988260865341648 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.39117521970978353 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.35583482417594536 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.21897822147396953 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3436473210057542 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.28979044279399635 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.33530850344530555 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.30160980000905374 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.4166613092238044 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.30796171250186904 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.22871315192743763 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.21669652626580332 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + 
"average_score": 0.36087312117067055 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3988260865341648 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.24616927284658197 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2900329121369093 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.42652313209316933 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.1209559708312353 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.25678368121442124 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.37605128363484847 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.4576088857728113 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.3464929909487855 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3858431845580602 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.2549787156825223 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Pixtral_12B/task_results.json b/static/eval_results/Default/Pixtral_12B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..39841de7705add7cd173cac13a05a4acf825453e --- /dev/null +++ b/static/eval_results/Default/Pixtral_12B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.501198167796616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.021818162950542508, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.3125, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.8888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], 
+ "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.6271825396825397, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + 
"output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.72, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.24488748504919397, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.0453150580624047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": 
"quizlet_question_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.6312894180411065, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams 
and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.23775530155879165, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6880000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.6488432934203903, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene 
and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": 
"cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.04444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific 
Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.6176470588235295, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface 
Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.6796315789473685, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5612662313600751, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.6222222222222221, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.497842105263158, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 
0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.7906428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4404761904761904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.48571428571428577, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.3392857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + 
"score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9798999999999998, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5264285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.15948320227617638, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.48810526315789476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.33511904761904754, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6964285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.48809523809523814, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene 
and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.2888888888888889, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + 
"score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.12857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.17142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.1571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 
0.17777777777777776, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.10416666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": 
"visual_prediction_rater_surface_normal_estimation", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.13095238095238096, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.23809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": 
"google_streetview_circle_reasoning", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.14285714285714285, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", 
+ "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.21684615697123963, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + 
"score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.17302345614703815, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.038726333907056806, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.8869047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.12244897959183673, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.3963641386180832, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 
0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.5994897959183675, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.16972789115646256, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.10810255920550038, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene 
and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5952380952380951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": 
"exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.19047619047619044, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.23232323232323232, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.31813725490196076, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", 
+ "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5707553325488358, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.35714285714285715, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.3095238095238096, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.011188369871131833, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": 
"Movie_retrieval_by_actor", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6077729535514645, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 
0.34210526315789475, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.22916666666666669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.1857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.14545454545454545, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding 
and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], 
+ "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.7380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + 
"output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.021428571428571432, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and 
Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.30025510204081635, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.22222222222222224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.2857142857142857, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based 
Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.5882352941176471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": 
"User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 
0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.19799999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, 
+ { + "name": "electrocardiogram", + "score": 0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.45714285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.7275862068965517, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + 
{ + "name": "doc_vqa", + "score": 0.8312500000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "generated_video_artifacts", + "score": 0.4375, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.19999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.35263157894736835, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.5071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.7137931034482758, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.5142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.7827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.7206896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.6888888888888888, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition 
and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "docci_image_description_long", + "score": 0.5928571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.45263157894736833, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.48666666666666664, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_qa", + "score": 0.7928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.5896551724137932, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.5785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.27586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.24666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.6172413793103447, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4387096774193549, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.7448275862068967, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.18571428571428575, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.4666666666666668, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.28666666666666674, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7384615384615386, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.43333333333333346, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.3631578947368421, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.37142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.5689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.48965517241379314, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.5533333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.2285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.17142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.08571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.4571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": 
"red_teaming_racial", + "score": 0.7700000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.725, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6799999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8947368421052632, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + 
"input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.53, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.29285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 
0.3428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Qwen2_VL_2B/summary_results.json b/static/eval_results/Default/Qwen2_VL_2B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..76a71eabec4ee5d88551bf968f232ee13dffdc5a --- /dev/null +++ b/static/eval_results/Default/Qwen2_VL_2B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.20877163406364055, + "micro_mean_score": 0.20561526268932287 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.3154302566225611, + "micro_mean_score": 0.33856405846947557 + }, + "overall_score": 
0.22249997162072932 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.22236161923122505 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.23701014663017753 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.25669221785292334 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.26526414975225454 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.17623548305581763 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.31250702198481506 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4140676691729323 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.20802820480076603 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.17320633068307653 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.06209506566980099 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.190837839372028 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.16287824421269087 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.19640906475019812 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2520741776922928 + }, + "Artistic and Creative Content": { + 
"count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.24883076673424442 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.2877316297453947 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.13398525561847363 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.1624451002757208 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.20960092816529263 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.19986806708136184 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.2201024015934558 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.30248748033122763 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.256631742010999 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.07681405895691609 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.10526691703628158 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.25018977062352593 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2520741776922928 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17435940889565366 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.21286783416184518 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.2521972668785968 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": 
[], + "average_score": 0.06967138760493456 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.16996250112948405 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.27603334911345223 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.31002436092347696 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.21061929716065056 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.2656728023444808 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.16356158787929762 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Qwen2_VL_2B/task_results.json b/static/eval_results/Default/Qwen2_VL_2B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1731cf4310404cc32e815ba9a38308438fd0969f --- /dev/null +++ b/static/eval_results/Default/Qwen2_VL_2B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "functionality_matching_in_different_objects", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6794157898981115, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + 
"app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "recipe_image_ordering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "rocks_samples_identify", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.20119047619047617, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "booking_web_recommendation", + "score": 0.4712868480725624, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signage_navigation", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.14285714285714285, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5699819523580396, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_temporal", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "extract_webpage_headline", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_visual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_parasite_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "CLEVRER_physics", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal 
Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.03355324641748354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and 
Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + 
"num_input": "2-3 images" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09455782312925169, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "cultural_vqa", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "hashtag_recommendation", + "score": 0.6476190476190474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_news_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "paper_review_acceptance", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images 
and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "counting_multi_image", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "photoshop_operation", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "vln_identify_robot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal 
Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.10101010101010101, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ishihara_test", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "Ad_count_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "video_eval_factual_pref", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": 
"2-3 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_name", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.15426188418265965, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "transit_map_intersection_points", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "music_sheet_sentiment", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and 
Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.068472026925481, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.06265503408592417, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ancient_map_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "vln_english_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": 
"4-5 images" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "arxiv_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.03571428571428571, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.10303030303030303, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "web_action_prediction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "media_homepage_profile", + "score": 0.0, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_camera_motion_description", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": 
"exact_text", + "num_input": "2-3 images" + }, + { + "name": "code_add_tag", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.33333333333333337, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", 
+ "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.36160000000000003, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5028221142675466, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.734126984126984, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "av_view_identification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "physical_property_reasoning", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.3707918542570349, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": 
"Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": 
"clevrer_video_moving_object_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "sta_action_localization_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.03214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", 
+ "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 
0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "landmark_check_two_images", + "score": 0.15555555555555553, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "dvqa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "long_string_letter_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.1691847451654077, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.3095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.034222739980969856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + 
"num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "stock_price_future_prediction", + "score": 0.28414285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_sequence", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_chordless_cycle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", 
+ "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.31341468102097647, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.4668571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_biology", + "score": 
0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5952380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "quizlet_question_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and 
Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.7238999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "movie_info_parsing", + "score": 0.6696428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.04162492664914846, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "TV_show_info_parsing", + "score": 0.6031746031746031, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 
0.9201680672268909, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "famous_building_recognition", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_isomorphism", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.35555555555555557, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_identity_matching", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5305263157894736, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + 
"app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.1433110081487998, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "science_basic_physics", + "score": 
0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.24444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.5178571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.2286666666666668, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "humor_understand_caption_match", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + 
"app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "game_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "emotion_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "perception_test_video_action_count", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.4155970724620079, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.3239260739260739, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.513578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning 
and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.42857142857142866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", 
+ "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.488095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"MMSoc_Memotion", + "score": 0.6588235294117649, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning 
and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": 
"9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object 
Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language 
Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based 
Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_subject", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + 
"app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_mask", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_semantics", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "shape_composition_shapes", + "score": 0.20493197278911562, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.07681405895691609, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" 
+ ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.10416666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + 
"name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"cvbench_adapted_cvbench_distance", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.3482758620689654, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.17142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.2689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.21379310344827587, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 
0.2285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.15555555555555556, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.24666666666666665, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.46428571428571436, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.1928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" 
+ }, + { + "name": "ocrqa", + "score": 0.5689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.4789473684210526, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "generated_video_artifacts", + "score": 0.1125, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.4799999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "table2latex_complex", + "score": 0.1111111111111111, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.22666666666666666, + "eval_type": "llm", + "num_demo": 1, 
+ "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.28620689655172404, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.42758620689655163, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.4206896551724139, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.7187500000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "video2notes", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event 
Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.31379310344827593, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.557142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.3533333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.47857142857142854, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.3482758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.3714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.55, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.07, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + 
"name": "GUI_Chat_Hard", + "score": 0.4548387096774193, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.3733333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.3206896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7631578947368421, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.39473684210526305, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "docci_image_description_long", + "score": 
0.45, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.15000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.11428571428571431, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ 
+ "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.09285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.4142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.10714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.36428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal 
Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.02142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + 
"output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.17857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_captcha", + "score": 0.12105263157894743, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8631578947368422, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.6900000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.62, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.30999999999999994, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8736842105263161, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.625, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Qwen2_VL_72B/summary_results.json 
b/static/eval_results/Default/Qwen2_VL_72B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..21ca30c7af4a3ac146431c7c2c9ef8774c9fdd1e --- /dev/null +++ b/static/eval_results/Default/Qwen2_VL_72B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.4542376574527161, + "micro_mean_score": 0.4501201906164793 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.5639771804231668, + "micro_mean_score": 0.5835339638865004 + }, + "overall_score": 0.4683625465479226 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.48669152179713876 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.5291932917937967 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.53654503409075 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.4931554892760308 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.3908023665629473 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5668846347262286 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6121127819548872 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.4493794346300551 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.33622171962424363 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + 
"average_score": 0.21642754068858566 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.5263730250833892 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.42759570727857965 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4228561177227288 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4780253686541936 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5070774860945021 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.4807292191169126 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.38847545874852984 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.4359156358804688 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.43781407268698613 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.49080138099759946 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.42481004254128113 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5132810622684265 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5062248706593999 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.3063303099017385 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.523959576707116 + }, + "1-image": { + "count": 315, + 
"num_samples": 5228, + "tasks": [], + "average_score": 0.4879791577413812 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4780253686541936 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.34846161336322395 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.44101149919132854 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5663587858366833 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.3067825586087303 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.4121566368482877 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5176521211872086 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5030444649397028 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.45616267568458396 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5047683071464567 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.3553838743540432 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Qwen2_VL_72B/task_results.json b/static/eval_results/Default/Qwen2_VL_72B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8a17c72937237724cf8c14214bfd7fc050e640b8 --- /dev/null +++ b/static/eval_results/Default/Qwen2_VL_72B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.735856751092166, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.08008270592840953, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.9285714285714286, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition 
and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.38333333333333336, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", 
+ "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.05299999999999997, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6938775510204082, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 
0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and 
Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.6987244897959183, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.78, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 
0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.2708333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.46765174197558695, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object 
Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.16496674091169844, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.7827708359722172, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.9480519480519481, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.34596078503152894, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.8039999999999998, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.6697279131770494, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 
0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event 
Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5777777777777777, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.11111111111111112, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": 
"code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + 
"score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7784210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.9047619047619049, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.7404077001845016, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.7777777777777779, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.7639999999999998, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.7441428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.47619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.65, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.8839285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9603174603174605, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"exchange_rate_estimate_plot", + "score": 0.9812571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.30314505117346885, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.7969263157894737, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ 
+ "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.318452380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.7410714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.8333333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.7232142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 
0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.12857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.1142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.2708333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language 
Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4411444596352561, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": 
"User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.32276496873129895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": 
"video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.08595213084035436, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.8583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.22448979591836735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.49168715680336533, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event 
Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.1836734693877551, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams 
and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.47891156462585033, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.17261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.3062298603651987, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and 
Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", 
+ "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" 
+ }, + { + "name": "orchestra_score_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + 
"score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.25476190476190474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.35353535353535354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + 
"skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.44313725490196076, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial 
Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.29485155847974215, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + 
"name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "signage_navigation", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.519047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, 
+ "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.5476190476190476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.022873657642764916, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 
0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.6166666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.13333333333333333, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6989250130797713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.7058823529411765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical 
Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.002625410220346934, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"code_translation_Python", + "score": 0.4166666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.29411764705882354, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.17391304347826086, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + 
"Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], 
+ "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": 
"1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.9047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.5882352941176471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.7142857142857143, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and 
Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.44285714285714295, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], 
+ "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.8452380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 
0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { 
+ "name": "ball_cup_swap_3", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.4263605442176871, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.37137188208616784, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.43333333333333335, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"ocr_resume_skill_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": 
"autorater_3d_model_texturing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.15, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.7058823529411765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + 
"Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.38095238095238093, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.2738095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": 
"kvqa_knowledge_aware_qa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + 
"score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.7586206896551724, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.5642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.7758620689655172, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.4724137931034481, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "wikihow_complex_task_completion", + "score": 
0.6444444444444444, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.4928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.8206896551724137, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 
0.5894736842105264, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "generated_video_artifacts", + "score": 0.21875, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.5533333333333332, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "table2latex_complex", + "score": 0.7555555555555555, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.31333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.7620689655172413, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.7172413793103449, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.7793103448275861, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.8, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "video2notes", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.6413793103448274, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and 
Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.6571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.5466666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.32105263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "vibe-eval", + "score": 0.6928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.6142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": 
"Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.6206896551724139, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.5642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.7857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.7428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": 
"Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7653846153846156, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.7499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.338, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.7, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.6096774193548388, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.4866666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.6724137931034484, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.4789473684210525, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "docci_image_description_long", + "score": 0.75, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 
0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.45714285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.37142857142857133, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.5142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.5857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.46428571428571425, + "eval_type": "llm", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.5928571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6285714285714284, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.23571428571428577, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision 
Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.39285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.3714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.32142857142857134, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.4428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.5950000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.6049999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.56, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.69, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/Qwen2_VL_7B/summary_results.json b/static/eval_results/Default/Qwen2_VL_7B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a67230b05e5f3234888e722eab28419c004ee575 --- /dev/null +++ b/static/eval_results/Default/Qwen2_VL_7B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + 
"macro_mean_score": 0.3293449599230247, + "micro_mean_score": 0.325331493515679 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1170, + "macro_mean_score": 0.43955105763038577, + "micro_mean_score": 0.45508547008546996 + }, + "overall_score": 0.34352990319228904 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.3506773570484231 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.38363163370919123 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2511, + "tasks": [], + "average_score": 0.3882785389756705 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2469, + "tasks": [], + "average_score": 0.38292659892379843 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2730765188348748 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.4625711182912848 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5287318295739348 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.32297080808954215 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2439, + "tasks": [], + "average_score": 0.2561357336105554 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.12651411144309255 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.35229497847636093 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2881996369284258 + }, + "Diagrams and Data 
Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3162917354476226 + }, + "Videos": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.3555910609857979 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3513518594470202 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.39509504888372243 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.19173322639974366 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3118818521697947 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3323478338046426 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.31975345327634014 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3207400992620562 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1456, + "tasks": [], + "average_score": 0.39680785337230745 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.38069986029874947 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.21448412698412703 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.34991843422677277 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.36487656334089386 + }, + "video": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.3555910609857979 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.23950364354876252 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, 
+ "tasks": [], + "average_score": 0.31886513111201115 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.3972495309304478 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.18098305857595157 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.30887234822244314 + }, + "Perception": { + "count": 145, + "num_samples": 2315, + "tasks": [], + "average_score": 0.39256038521661607 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.44924313486983725 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.2880278656037017 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.4015531477048036 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.24179792538224956 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/Qwen2_VL_7B/task_results.json b/static/eval_results/Default/Qwen2_VL_7B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..48d910a2b4b14cfe26a2d195778a1e9ea647c0ac --- /dev/null +++ b/static/eval_results/Default/Qwen2_VL_7B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.3953838788800739, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + 
"output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.026547987444069228, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical 
and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.5370954442383014, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": 
"Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.64, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.37716272271985773, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.12041002821230626, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"coco_person_detection", + "score": 0.7728401943343881, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8766233766233765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal 
Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.19897365891504984, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6093333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.7543233207204482, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge 
and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + 
"name": "newspaper_ocr_in_query_box", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.6444444444444443, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.12222222222222223, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.018518518518518517, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", 
+ "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7726315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.746031746031746, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.6253043759566965, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + 
"output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.5065263157894736, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", 
+ "score": 0.6086428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.6285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific 
Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.78125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.7410714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9722222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + 
"score": 0.47619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9400428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.45385714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.6399368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.3928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", 
+ "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.7232142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.5982142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": 
"User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + 
"name": "cvbench_adapted_cvbench_distance", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.08888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + 
"output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.06598639455782314, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.35939538929969556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.2838251878691227, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"GUI_Act_Web_Single", + "score": 0.032670025538873985, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.8869047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.42857142857142855, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.2756576749919229, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + 
"Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"geometry_reasoning_count_line_intersections", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.36434240362811793, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.17261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.13337585034013605, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition 
and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and 
Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6904761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.1845238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.25252525252525254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.39166666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", 
+ "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.11383541415414918, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", 
+ "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.0503968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "signage_navigation", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.2261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.3095238095238096, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.03296776632380673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", 
+ "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0066666666666666775, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.35, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.7142674593015049, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.000811738675187593, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and 
Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": 
"1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.3714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.08484848484848485, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.08695652173913043, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 
15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + 
"Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.07857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and 
Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6352941176470588, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + 
"name": "table_understanding_complex_question_answering", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.6904761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 
0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.2787414965986395, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.2790532879818594, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.5, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + 
"app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + 
"Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"pictionary_chinese_food_img2en", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.14285714285714285, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.5714285714285714, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6071428571428571, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language 
Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": 
"Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.6785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 
0.4310344827586208, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.37999999999999995, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.5620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7947368421052632, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.5793103448275861, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"science_figure_explanation", + "score": 0.506896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.4368421052631579, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "humor_explanation", + "score": 0.2666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video2notes", + "score": 0.43571428571428567, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "activitynetqa", + "score": 0.445, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + 
"skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.6857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.7142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "art_explanation", + "score": 0.3724137931034482, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.4206896551724137, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.33333333333333326, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "defeasible_reasoning", + "score": 0.5517241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.19285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 
0.7250000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "funny_image_title", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6500000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.3068965517241378, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.12999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.3333333333333333, + "eval_type": "llm", 
+ "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_summary", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.29285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.6931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.28, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.15333333333333335, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language 
Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4548387096774194, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.3714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.5571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.31249999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "table2latex_complex", + "score": 0.5333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and 
Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.6357142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.23750000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.2899999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_content_follow_up", + "score": 0.4428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7449999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": 
"1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.865, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.7449999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.8850000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6399999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + 
"Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.255, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.41428571428571426, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.4071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.1357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.22857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.2214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.1, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.24285714285714288, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.2071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.13571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/all_model_keywords_stats.json b/static/eval_results/Default/all_model_keywords_stats.json deleted file mode 100644 index 0fd965fa68464b34b7227d2a08b3cd074a9bb74f..0000000000000000000000000000000000000000 --- a/static/eval_results/Default/all_model_keywords_stats.json +++ /dev/null @@ -1,5384 +0,0 @@ -{ - "GPT_4o": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.5630758211022604 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.6216411634729735 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.616018277142757 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.5823101249498799 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.44177544539510955 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - 
"tasks": [], - "average_score": 0.6345458069232931 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6795263157894738 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.5514924675940659 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.39435038953269674 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.22934807257231926 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.608083455060831 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.491325251564869 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.4999089647103332 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.5315979872161023 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5641404607063637 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.5613545677222056 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.47760591698367955 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.5388690453811203 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.48037685656449847 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5994159671881645 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 
0.44606605087301393 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.6274371950293718 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5448877153826162 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.4751133786848073 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5343350103400748 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5672657028463585 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.5315979872161023 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.4500928191484624 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.4908653289106883 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.7056027785545881 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.33202130899313653 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.5032849161169843 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.5510350848991218 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.6095778863474799 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.5283797185155754 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.6135723164021851 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.44047720383044436 - } - } - }, - "Gemini_1.5_pro_002": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - 
"num_samples": 4755, - "tasks": [], - "average_score": 0.5202055934299538 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.5017043129027509 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.5532599716027446 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.546753787203128 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.425969084163906 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5751012914154264 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6982330827067671 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.513647745999633 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.3845337030093212 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.23899503258223884 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.4625032188638111 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4292353723689881 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.4869625906903554 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.5028718355967439 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5584779204331461 - }, - "Photographs": { - "count": 143, - 
"num_samples": 2248, - "tasks": [], - "average_score": 0.55005349042813 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.4292127751495457 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.44896309957892694 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.44418591808616864 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5146447350354234 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.4688623462674191 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5580414823700747 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5538255562099124 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.39066515495086923 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5370278962809547 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5034399620483027 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.5028718355967439 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.4885398161821004 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.45544217378728585 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.5421439953094952 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.3335324339429373 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 
0.43465181771633377 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.5250631828331306 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.5821004797173627 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.5124355410095621 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.5722329455291694 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.41210885517904977 - } - } - }, - "Gemini_1.5_flash_002": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.46250942866818673 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.4337278553354258 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.49947464681475356 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.5098686082319499 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.34393279682972117 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5594391803821158 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6380250626566416 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.44816564352475535 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.34510790215980036 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.18973764406890803 - } 
- }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.3865262916591035 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.3598139859097534 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.4013870708864889 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4903530871753026 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5051202896842343 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.5166044655846657 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.3849084036535956 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3869438864407766 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.39868324168390534 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.44793686445264996 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.3704146726364947 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5448638967636353 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.47829883834573317 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.33669690098261523 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.43653808057103954 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - 
"average_score": 0.4427944359714585 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4903530871753026 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.42346517633403413 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.41994719346489817 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.46645473820179373 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.2517485212411566 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.40372378342017806 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.4799408254775632 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.6010361821632402 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.4569546533897065 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.511590428993871 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.33710867194177685 - } - } - }, - "Claude_3.5": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.5405089647404562 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.6082834220752651 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.5745077617490254 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.5450038475783499 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - 
"average_score": 0.4767692987630454 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5756126284078804 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6969774436090224 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.5278843049497918 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.4082144793870471 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.23803578664609892 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.5691641481808987 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4795267886975966 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.525848282456283 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.508735695828719 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.5699094130430454 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.5096772701625744 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.4429640420975014 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.5066797418318023 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.4971460788134188 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 
0.5278127103234661 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.4490020843308984 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5838224169821388 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5456152399978661 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.46300075585789874 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5414381873407914 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5373019912310933 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.508735695828719 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.4422556748863689 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.49311554035078103 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.6663170946790707 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.3382015835012861 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.5194010220575684 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.532329797132399 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.5808831682303479 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.513474611293123 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.5507075880782885 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.47461998432626556 - 
} - } - }, - "Claude_3.5_new": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.5690045172520449 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.6220681231036606 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.6077980666415158 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.5511440615639541 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.4885536652013625 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5908204006544897 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6569473684210526 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.5486763511384175 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.4315385951907387 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.2909419331017877 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.6048192628845258 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.48924295292319175 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.556418710368288 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4946691340754988 - }, - "Artistic and Creative Content": { - "count": 32, - 
"num_samples": 541, - "tasks": [], - "average_score": 0.5558756390298104 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.5425198547046186 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.44210335381541843 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.5187252051932875 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.5071121107460066 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5387340524651681 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.4824302644151348 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.6242798397166945 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5782691045270721 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.4630277507828528 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5914338446093256 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5636254729390459 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4946691340754988 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.4828123870640382 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.48756636014597515 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.6590137441693218 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - 
"average_score": 0.39901670035164916 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.5166853031535193 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.5561634744977417 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.6123769274172342 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.5512015158810595 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.565796566886933 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.4763267502912362 - } - } - }, - "GPT_4o_mini": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.4492982787524939 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.49026056071002017 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.5168957112681365 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.46731791428406805 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.3406008235342885 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5572925295284307 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6902380952380953 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.4189154010048976 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2943206715105082 - }, - "Planning and 
Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.19422793560945503 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.47202628409684394 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.3624496929166193 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.38946844562183286 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.45508480503584553 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.47569921440672464 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.465175334092545 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.29410984789062117 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.41242028190533997 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3906415365938764 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.44244772638735347 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.3629944944697668 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5713834131825314 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.39874839531459466 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.3359977324263039 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - 
"average_score": 0.4305788513381019 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.46343334374251277 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.45508480503584553 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.24651576711552803 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.36981497185070983 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.5666618234843734 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.2420320329702607 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.3458483931206892 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.43590838051817093 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.5176671720617656 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.3554299482098288 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.5399167524341886 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.32918280841495845 - } - } - }, - "Qwen2_VL_72B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.49787264809826687 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.5439010430283516 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.5392244859385411 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.509277882172206 
- }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.3776739609562984 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5676817981386025 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.60496992481203 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.4633019068994453 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.35105970797600183 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.2201150812944581 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.5402397677488632 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4289777675393297 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.42094543671351287 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.49943888306036405 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.507967430369507 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.49789939867591104 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.36212605501536715 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.44719815365440824 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 
0.4500902736468407 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.5098505660529429 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.4027115384266939 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.5157810622684265 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.5199940976484408 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.3100812547241119 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.5468722850464449 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.4918205178721877 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.49943888306036405 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.36691704884033916 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.45176098055218655 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.5807658773593334 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.31245958897213383 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.4372517645050852 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.5362106489630868 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.4968249101570037 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.4488852456563113 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 
0.5166939389651373 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.31157492395100744 - } - } - }, - "Qwen2_VL_7B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.3708368629321668 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.40213773918065815 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2511, - "tasks": [], - "average_score": 0.4034335110538307 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2469, - "tasks": [], - "average_score": 0.4109909230944937 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2818925976996871 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.49360878418945336 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.5215889724310777 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.33309401517140946 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2439, - "tasks": [], - "average_score": 0.27564756843599875 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.1473690605854188 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.3821046882337143 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.2896392967775049 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.3223325179806271 - }, - "Videos": { - "count": 43, - 
"num_samples": 700, - "tasks": [], - "average_score": 0.4111189310485516 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.34825121621909577 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.40660144920567376 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.262166593895899 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3430730210869785 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3426196933687219 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.35162604166912687 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.32665673520415817 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1456, - "tasks": [], - "average_score": 0.3909745200389741 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.39898011714302023 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.19415154950869234 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.37453319457428763 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.37701588079136955 - }, - "video": { - "count": 43, - "num_samples": 700, - "tasks": [], - "average_score": 0.4111189310485516 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.26429868057315387 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.33008667136891007 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - 
"tasks": [], - "average_score": 0.42746758545520747 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.2003871750665659 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.3270187644950453 - }, - "Perception": { - "count": 145, - "num_samples": 2315, - "tasks": [], - "average_score": 0.40048749993497734 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.4245693009859056 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.29880557491654197 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.4276637093173368 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.25562039051316643 - } - } - }, - "llava_onevision_72B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.3615741356043519 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.2834675874668524 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.3674817002808495 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.42146038539739283 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2951434804409883 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.478119286755779 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.6005438596491229 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.31663222188988865 - }, - "Spatial and 
Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.29633645022129285 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.13872280436872364 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.23380046931752074 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.2126914943750874 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.34566020099204997 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4446001874842145 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.4401364830377099 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.4247591719013819 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.23897262553543516 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.2868275930712835 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.259450238500612 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.370724080249463 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.3065719940769206 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.4293132525502993 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3986052416087927 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - 
"tasks": [], - "average_score": 0.20730347694633405 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.28104747671521785 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.34840850032295206 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.4446001874842145 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.25013213032747944 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.34156793747875674 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.3076421844825067 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.18168666652660437 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.23240790940031927 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.38362780453378204 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.4807891958712894 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.31702495228966576 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.4358874880224115 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.31588468105075895 - } - } - }, - "llava_onevision_7B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.2524786809911341 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.19077168655703208 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 
0.2555444562659206 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.29981286990552625 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.18973491465938852 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.36842322314565323 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.44998746867167916 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.2445135206648208 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.21802943568344288 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.06658775725427067 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.1466861610319767 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.13297395577964055 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.24236719143449742 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.30985943541023103 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.3199731020402028 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3263378734842879 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.13043163858789789 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": 
[], - "average_score": 0.20277804188944173 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.18291595756285564 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.25384794412815426 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.2200472229099345 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.3127341248874411 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.2802999516721972 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.1476473922902494 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.13803800801858385 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.2548084764084038 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.30985943541023103 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.1778991941079372 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2410111891690358 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.19283211154717242 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.09846926279075068 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.15189414475467605 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.28505205882578405 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3600079950628582 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": 
[], - "average_score": 0.23654776813656775 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.3271805711561501 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.22080546908673507 - } - } - }, - "InternVL2_76B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.38193012983650343 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.41315219763443384 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.43665980552577693 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.4265623936500962 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2975890791763991 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.5257990949897898 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.5779473684210527 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.33287081421166276 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2949505390920417 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.17036496432397477 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.3634339625985008 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.31396468806559114 - }, - "Diagrams and Data 
Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.3473756113126343 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.395893002855977 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.44982107744035305 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.42875248733027654 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.2868239162778749 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3630499545707523 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3476691827105281 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.3943337471922549 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.29244088978470345 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.45822072478616577 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3879326330400817 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.20309901738473166 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.34771123515123364 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.4145693044465943 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.395893002855977 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.24403942809507134 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - 
"tasks": [], - "average_score": 0.3153417935059416 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.4306947454508794 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.2132321995754061 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.2953329718984368 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.42202934355552685 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.47409276729986083 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.30014798153766264 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.4625649385962016 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.2868813944130515 - } - } - }, - "InternVL2_8B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.2817247716997634 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.280559214034858 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2511, - "tasks": [], - "average_score": 0.32020728060179815 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2469, - "tasks": [], - "average_score": 0.325593535916075 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.24118253695139918 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.39684007367798446 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.4700852130325815 - }, - "Domain-Specific 
Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.27052668526005397 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2439, - "tasks": [], - "average_score": 0.23189345356483618 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.08260405712900723 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.22800928556370195 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.2013779290163996 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.2804429603269583 - }, - "Videos": { - "count": 43, - "num_samples": 700, - "tasks": [], - "average_score": 0.34791358240562653 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.2942163420306113 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3388056726588417 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.10933317885944857 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.250804626773504 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.2522493284864019 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.27414636444623874 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.22381302045502052 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1456, - "tasks": [], - "average_score": 0.3537549824897016 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - 
"tasks": [], - "average_score": 0.30261189962428353 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.15434618291761149 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.19872104324302098 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.30088711082969344 - }, - "video": { - "count": 43, - "num_samples": 700, - "tasks": [], - "average_score": 0.34791358240562653 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.17725087609332119 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2532272454839157 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.29129840423784176 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.12166926715781588 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.24700310231619527 - }, - "Perception": { - "count": 145, - "num_samples": 2315, - "tasks": [], - "average_score": 0.3214666523378005 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3995660275981844 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.24614711281861912 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.3393895915929317 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.22078333222564453 - } - } - }, - "MiniCPM_v2.6": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.2604967101191775 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.2500331562865158 - }, 
- "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.3003169369011028 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.31808748114668184 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.18281637763548025 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.40732197204308807 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.48798245614035085 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.23723675736151562 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.1968926733821904 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.08735883237069725 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.21195711598986072 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.18639148159043903 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.21578309681746147 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.3527537836840162 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.3096882575625531 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3176880312524649 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 
0.0755920550038197 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.23506388020592064 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.1781127776443048 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.2551275278138797 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.20833171754655547 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.36473950920880716 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.293386806641223 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.13955971277399848 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.23596215721092323 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.26319603880798287 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.3527537836840162 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.17888270664238365 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.22288558250834017 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.2666989364424082 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.11693267119342445 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.15342045420318667 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.29243044121840894 - }, - "Metrics": { - "count": 20, - 
"num_samples": 309, - "tasks": [], - "average_score": 0.3777897246686755 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.25714862989687987 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.33187729423141027 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.16493399805627715 - } - } - }, - "Phi-3.5-vision": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.2551037902226636 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.2483252111012436 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.28732942108098564 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.3049602749093698 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.21653804346780042 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.36823084724842464 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.46663157894736845 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.24145330077248778 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2154692063816354 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.08944481289041872 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.1865974025588298 - }, - "Text-Based Images 
and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.17497379027990792 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.26053460127801603 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.24669318645450836 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.2786226802221388 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3413768635559215 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.15444746077692828 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.2177924712685756 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.21443984349574025 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.2572371188897671 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.21409351002477045 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.365192668303297 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.25960269434727634 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.12546296296296297 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.14337869666229008 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.27790147494714373 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.24669318645450836 - }, - "4-5 images": { - "count": 
34, - "num_samples": 520, - "tasks": [], - "average_score": 0.20168001345379397 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2850550871176333 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.2237087834389946 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.08928724806836039 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.219367263034246 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.316318567258608 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3945898792928062 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.21925278489551242 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.33264696401038385 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.17575913004138646 - } - } - }, - "Pixtral_12B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.3460288961410444 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.3777640755922415 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.38299418297106824 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.3776722463473817 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2828575553466608 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.419071767659191 - }, - "Ethical and 
Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.5687919799498747 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.32813540763467464 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2677293131171651 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.10591240329992047 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.3070067338940785 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.28832738144368647 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.3223299098375932 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.409643099998057 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.37450808136321684 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.37115973962368864 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.24009431093278263 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3078181788009137 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3188475653127356 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.3639544140938305 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.32073418701669026 - }, - "open_ended_output": { - "count": 80, - 
"num_samples": 1454, - "tasks": [], - "average_score": 0.4166613092238043 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3008126415966517 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.19743008314436883 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.16642294307267227 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.37108130557306335 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.409643099998057 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.2575699315401612 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.3104621543981899 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.4300741596942578 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.13622980866275425 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.2572414987500377 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.3892097218585385 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.5020540387409291 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.31301986568151985 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.3809515410188075 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.24222628640267738 - } - } - }, - "Llama_3_2_11B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 
0.1907604552173455 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.14328677752263275 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.19646404502647707 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.22399113135844315 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.13303760019716085 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.323153603297999 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.4260501253132832 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.1770852858056774 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.15366454315378308 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.06563884729522687 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.11886347847341794 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.11489351406848371 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.1693681214060816 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.2123769209846321 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.2520175802062012 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - 
"average_score": 0.2485354956932213 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.06418655520777307 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.12417283740525839 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.16374180545556977 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.1576236804437753 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.15014439824913947 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.3003142292328822 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.19270157739425633 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.1463246409674981 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.0732004839476103 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.1960107191983825 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.2123769209846321 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.1351857051327849 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.18586695387250338 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.17288724679416761 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.08100042975820579 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.0575426944971537 - }, - 
"Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.19899465185565898 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.254316961351997 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.162801811963855 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.28055776664538923 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.13937853323074623 - } - } - }, - "Idefics3": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.14507788965553362 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.11641535161320743 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.17255583910766542 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.14745217246476708 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.1331851390883708 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.19221534222332276 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.28640852130325817 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.17906399043310475 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.10192930055370109 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.04211916597550756 - } - }, - "input_format": { - "User 
Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.10126271262360581 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.11407926733108291 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.16225217317782772 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.16181866973635636 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.1839408679813373 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.14933801491626408 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.0395540896656236 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.13979628998424784 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.1062779093260333 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.07053056796593082 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.09790172378722654 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.2987797010800956 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.11588163814170001 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.1008692365835223 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.09308121224497533 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.14757589734485796 
- }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.16181866973635636 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.12217834249866026 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.12276246278377517 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.14743542163139847 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.05354869594691955 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.09065540194572455 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.1463280929280822 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.14564374862578883 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.22748773785486257 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.17647756032677067 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.13168972973651977 - } - } - }, - "Aria": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.3264829094772722 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.35712138797286674 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.4004806395853317 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.3783082688258977 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.27628131703993153 - }, - 
"Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.4942870225393938 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.5811228070175439 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.3279996334048362 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.2481896092177717 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.11945216302285933 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.2830308005758272 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.27833423130489043 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.32371820359400666 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.42875359425696014 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.3612041984219992 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.37290568595471846 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.19554976321164697 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.3092653492193887 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.3043751656077328 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.2930015244066511 - }, - "numerical_data": 
{ - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.3092167834876797 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.4523860109667709 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3277812604542708 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.21139455782312927 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.2711617723374526 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.3576735443060994 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.42875359425696014 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.19839956701033565 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.27267126872569447 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.38321397541649777 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.14301905320436192 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.2849545194421855 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.3779947327886569 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.39678729061309725 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.29682445889316517 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.4096377585306089 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.26194160419181234 - } - } - }, - "NVLM": { - "skills": 
{ - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.24033557047857043 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.32154059695494047 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.2937052996171993 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.22845955700594492 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2639741933075709 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.40870864071047447 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.4555238095238095 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.25785191641267197 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.15679681195908274 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.0672259242345112 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.23922823287047076 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.21734036617042948 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.30313485498585124 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.0 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 
0.34726189956094355 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.3264757655296162 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.056894830390305184 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.22868389095927066 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.2788963949121424 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.2787764976961992 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.23349712171444964 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.3215948035793096 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.18487055428231897 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.0 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.0 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.3680809151131777 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.0 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.03838410364145658 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2325581694709435 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.22773778915303383 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.08048160660797504 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], 
- "average_score": 0.2390024647851972 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.30211261814126533 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.18857142857142856 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.24908307640275493 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.3724877947012685 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.24529601154794037 - } - } - }, - "InternVL2_2B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.14491178903291552 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.12126906675624163 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.16912754929321935 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.18542274192083463 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.13923308734553164 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.23992252224543772 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.3420927318295739 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.14807577209152425 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.13036555933925006 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 
0.01727799227799228 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.057021136657850864 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.10504085961245285 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.1625198552182714 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.18999779001767986 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.1487677475708977 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.2011727338536935 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.11886936592818943 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.1131404778887607 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.05739750616837997 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.15465451663650032 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.16044698450090833 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.21429521387724249 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.2128614316540013 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.03658352229780801 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.05757839721254354 - }, - "1-image": { - "count": 315, - "num_samples": 
5228, - "tasks": [], - "average_score": 0.15225683687839608 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.18999779001767986 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.17677460549936644 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.158165588340436 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.08722661966805 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.04102853815875594 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.11264043251709285 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.17001758160301803 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3332891958712894 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.1686125516807394 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.21169137106199268 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.10975764217070672 - } - } - }, - "Qwen2_VL_2B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.22236161923122505 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.23701014663017753 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.25669221785292334 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.26526414975225454 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - 
"tasks": [], - "average_score": 0.17623548305581763 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.31250702198481506 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.4140676691729323 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.20802820480076603 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.17320633068307653 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.06209506566980099 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.190837839372028 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.16287824421269087 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.19640906475019812 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.2520741776922928 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.24883076673424442 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.2877316297453947 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.13398525561847363 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.1624451002757208 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.20960092816529263 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - 
"average_score": 0.19986806708136184 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.2201024015934558 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.30248748033122763 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.256631742010999 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.07681405895691609 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.10526691703628158 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.25018977062352593 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.2520741776922928 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.17435940889565366 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.21286783416184518 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.2521972668785968 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.06967138760493456 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.16996250112948405 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.27603334911345223 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.31002436092347696 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.21061929716065056 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.2656728023444808 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - 
"average_score": 0.16356158787929762 - } - } - }, - "Aquila_VL_2B": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.18420666660337692 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.12395530240359122 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.17924536722051596 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.220108610660707 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.1680749869910155 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.26630477322766793 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.35152130325814535 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.1857154485444521 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 0.1616397700608881 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.044513236949565 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.07480350331940272 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.11444110320621242 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.19412275574929044 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.21367350061199514 - }, - "Artistic and 
Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.19717811128156643 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.24620947964695974 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.10131259529340846 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.11925340914357861 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.123417109500157 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.18474924824567768 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.19908864029107046 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.23278612647548963 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.22108484223035305 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.11057256235827662 - }, - "9-image or more": { - "count": 41, - "num_samples": 623, - "tasks": [], - "average_score": 0.011631871744697361 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.18240049845355885 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.21367350061199514 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.1898373110613516 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.23274180707905315 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.09484068019620011 - }, - "Planning": { - "count": 78, - 
"num_samples": 1239, - "tasks": [], - "average_score": 0.05864269260897992 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.13323092677931386 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.20714098741611 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.2932627505936196 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.21075421274487907 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.24110595572817994 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.20711160718581811 - } - } - }, - "Mammoth_VL": { - "skills": { - "Object Recognition and Classification": { - "count": 303, - "num_samples": 4755, - "tasks": [], - "average_score": 0.30194776127683565 - }, - "Text Recognition (OCR)": { - "count": 137, - "num_samples": 2239, - "tasks": [], - "average_score": 0.2365295791606494 - }, - "Language Understanding and Generation": { - "count": 154, - "num_samples": 2509, - "tasks": [], - "average_score": 0.2993927028494267 - }, - "Scene and Event Understanding": { - "count": 154, - "num_samples": 2467, - "tasks": [], - "average_score": 0.3366347826116991 - }, - "Mathematical and Logical Reasoning": { - "count": 109, - "num_samples": 1910, - "tasks": [], - "average_score": 0.2408454736444444 - }, - "Commonsense and Social Reasoning": { - "count": 51, - "num_samples": 855, - "tasks": [], - "average_score": 0.37895522991264047 - }, - "Ethical and Safety Reasoning": { - "count": 15, - "num_samples": 245, - "tasks": [], - "average_score": 0.48003508771929826 - }, - "Domain-Specific Knowledge and Skills": { - "count": 77, - "num_samples": 1386, - "tasks": [], - "average_score": 0.27232427744946475 - }, - "Spatial and Temporal Reasoning": { - "count": 152, - "num_samples": 2437, - "tasks": [], - "average_score": 
0.24522937191710698 - }, - "Planning and Decision Making": { - "count": 37, - "num_samples": 577, - "tasks": [], - "average_score": 0.11457024299726488 - } - }, - "input_format": { - "User Interface Screenshots": { - "count": 93, - "num_samples": 1517, - "tasks": [], - "average_score": 0.18941525254390731 - }, - "Text-Based Images and Documents": { - "count": 82, - "num_samples": 1294, - "tasks": [], - "average_score": 0.1718334741390191 - }, - "Diagrams and Data Visualizations": { - "count": 101, - "num_samples": 1718, - "tasks": [], - "average_score": 0.28108187023954245 - }, - "Videos": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.3391119999611432 - }, - "Artistic and Creative Content": { - "count": 32, - "num_samples": 541, - "tasks": [], - "average_score": 0.36434285930327387 - }, - "Photographs": { - "count": 143, - "num_samples": 2248, - "tasks": [], - "average_score": 0.36915384448504296 - }, - "3D Models and Aerial Imagery": { - "count": 11, - "num_samples": 169, - "tasks": [], - "average_score": 0.15940750469262005 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 98, - "num_samples": 1514, - "tasks": [], - "average_score": 0.2456942956200745 - }, - "structured_output": { - "count": 110, - "num_samples": 1714, - "tasks": [], - "average_score": 0.21586513216389874 - }, - "exact_text": { - "count": 83, - "num_samples": 1278, - "tasks": [], - "average_score": 0.29359048024032264 - }, - "numerical_data": { - "count": 49, - "num_samples": 862, - "tasks": [], - "average_score": 0.2646677074112521 - }, - "open_ended_output": { - "count": 80, - "num_samples": 1454, - "tasks": [], - "average_score": 0.34733130661096645 - }, - "multiple_choice": { - "count": 85, - "num_samples": 1363, - "tasks": [], - "average_score": 0.3286125236284589 - } - }, - "input_num": { - "6-8 images": { - "count": 21, - "num_samples": 314, - "tasks": [], - "average_score": 0.16358654572940287 - }, - "9-image or more": { - "count": 41, 
- "num_samples": 623, - "tasks": [], - "average_score": 0.25463059203015115 - }, - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.2919119209789575 - }, - "video": { - "count": 43, - "num_samples": 698, - "tasks": [], - "average_score": 0.3391119999611432 - }, - "4-5 images": { - "count": 34, - "num_samples": 520, - "tasks": [], - "average_score": 0.20016011839130254 - }, - "2-3 images": { - "count": 51, - "num_samples": 802, - "tasks": [], - "average_score": 0.2679179451692527 - } - }, - "app": { - "Information_Extraction": { - "count": 72, - "num_samples": 1124, - "tasks": [], - "average_score": 0.23600902063965679 - }, - "Planning": { - "count": 78, - "num_samples": 1239, - "tasks": [], - "average_score": 0.15326915093278803 - }, - "Coding": { - "count": 31, - "num_samples": 474, - "tasks": [], - "average_score": 0.20668466311255687 - }, - "Perception": { - "count": 145, - "num_samples": 2313, - "tasks": [], - "average_score": 0.33348955971237954 - }, - "Metrics": { - "count": 20, - "num_samples": 309, - "tasks": [], - "average_score": 0.3759170425350556 - }, - "Science": { - "count": 29, - "num_samples": 574, - "tasks": [], - "average_score": 0.23894961766260706 - }, - "Knowledge": { - "count": 97, - "num_samples": 1605, - "tasks": [], - "average_score": 0.351703435685048 - }, - "Mathematics": { - "count": 33, - "num_samples": 547, - "tasks": [], - "average_score": 0.26074348700688493 - } - } - } -} \ No newline at end of file diff --git a/static/eval_results/Default/all_summary.json b/static/eval_results/Default/all_summary.json deleted file mode 100644 index e82fc987307418940619aab4a3a374e74c684b19..0000000000000000000000000000000000000000 --- a/static/eval_results/Default/all_summary.json +++ /dev/null @@ -1,525 +0,0 @@ -{ - "GPT_4o": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5203440930873326, - "micro_mean_score": 0.514302640282204 - 
}, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5265030595065238, - "micro_mean_score": 0.5236338521693411 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.6478225794744895, - "micro_mean_score": 0.665391229578676 - }, - "overall_score": 0.5421184432647768 - }, - "Gemini_1.5_pro_002": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4699992918320008, - "micro_mean_score": 0.4651116133689296 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4822473962867704, - "micro_mean_score": 0.4764805563057179 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.5858190649927173, - "micro_mean_score": 0.6104901117798793 - }, - "overall_score": 0.4955784031499121 - }, - "Gemini_1.5_flash_002": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.41898948981774853, - "micro_mean_score": 0.4127376993779598 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4189319021967416, - "micro_mean_score": 0.41567515414375245 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.5691365176285039, - "micro_mean_score": 0.5987532244196045 - }, - "overall_score": 0.43831534488249924 - }, - "Claude_3.5": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.48800427486796155, - "micro_mean_score": 0.4814327812005499 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5040975742801586, - "micro_mean_score": 0.5002259116666758 - }, - "open": { - 
"num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.6373907158949892, - "micro_mean_score": 0.6569647463456579 - }, - "overall_score": 0.5212541172602853 - }, - "Claude_3.5_new": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4919657684484185, - "micro_mean_score": 0.4874520567007144 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5259191914020757, - "micro_mean_score": 0.5230785894131227 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.6563419761104125, - "micro_mean_score": 0.6724419604471196 - }, - "overall_score": 0.5427062825031487 - }, - "GPT_4o_mini": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.39854757130003565, - "micro_mean_score": 0.3936551517403452 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.40767494558789397, - "micro_mean_score": 0.40431644154143376 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.586537827213665, - "micro_mean_score": 0.6133276010318144 - }, - "overall_score": 0.43069690064863675 - }, - "Qwen2_VL_72B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.46406654108789214, - "micro_mean_score": 0.4584702152011697 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4542376574527161, - "micro_mean_score": 0.4501201906164793 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.5639771804231668, - "micro_mean_score": 0.5835339638865004 - }, - "overall_score": 0.4769263263488681 - }, - "Qwen2_VL_7B": { - "core_noncot": { - 
"num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3480020832611913, - "micro_mean_score": 0.3441858958345098 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3293449599230247, - "micro_mean_score": 0.325331493515679 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1170, - "macro_mean_score": 0.43955105763038577, - "micro_mean_score": 0.45508547008546996 - }, - "overall_score": 0.3597856146156421 - }, - "llava_onevision_72B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3199332158220174, - "micro_mean_score": 0.31770770553892647 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.2974368415462532, - "micro_mean_score": 0.2956217833156672 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.4599484231632498, - "micro_mean_score": 0.4850386930352536 - }, - "overall_score": 0.33795497518277007 - }, - "llava_onevision_7B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.22409531510496777, - "micro_mean_score": 0.22238854298563537 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.21362697219149712, - "micro_mean_score": 0.21073910058505504 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.33979975321921935, - "micro_mean_score": 0.36474634565778147 - }, - "overall_score": 0.23898796555531696 - }, - "InternVL2_76B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3502244283768534, - "micro_mean_score": 0.3456783051732046 - }, - "core_cot": { - "num_eval_tasks": 
440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3562710424410931, - "micro_mean_score": 0.35129859801162616 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.5192997443033639, - "micro_mean_score": 0.5421324161650903 - }, - "overall_score": 0.3772549347599992 - }, - "InternVL2_8B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.25956581776451815, - "micro_mean_score": 0.2546984460483302 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.24090301358258295, - "micro_mean_score": 0.23819084111520938 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1165, - "macro_mean_score": 0.3978571701460552, - "micro_mean_score": 0.4108583690987125 - }, - "overall_score": 0.2773656948037259 - }, - "MiniCPM_v2.6": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.2287645706203155, - "micro_mean_score": 0.2249087742955901 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.22955895202146906, - "micro_mean_score": 0.22560399396899078 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.41728623355613875, - "micro_mean_score": 0.43452278589853827 - }, - "overall_score": 0.2537218694467236 - }, - "Phi-3.5-vision": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.23271251159409778, - "micro_mean_score": 0.2296262323791101 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.22995297916629392, - "micro_mean_score": 0.22708502951025372 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - 
"macro_mean_score": 0.3947914647737769, - "micro_mean_score": 0.42459157351676696 - }, - "overall_score": 0.25357415903306635 - }, - "Pixtral_12B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.31905695620134694, - "micro_mean_score": 0.31556607913724777 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.31362045151669854, - "micro_mean_score": 0.3100986209078182 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.4566234428542061, - "micro_mean_score": 0.4870593293207223 - }, - "overall_score": 0.33676353369131895 - }, - "Llama_3_2_11B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.10044261716549671, - "micro_mean_score": 0.09980638766828835 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.15999641916771298, - "micro_mean_score": 0.15809331016967038 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.3173342406187366, - "micro_mean_score": 0.3487962166809973 - }, - "overall_score": 0.1802478219287358 - }, - "Idefics3": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.11118980301103833, - "micro_mean_score": 0.11201785633274061 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.08956972487602757, - "micro_mean_score": 0.08982225274252693 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.3210866162255635, - "micro_mean_score": 0.35649183147033553 - }, - "overall_score": 0.138206224513898 - }, - "Aria": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - 
"num_not_eval_samples": 0, - "macro_mean_score": 0.30485930718699694, - "micro_mean_score": 0.3016713629035311 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.289073788209904, - "micro_mean_score": 0.2859007507765791 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.5103725263180767, - "micro_mean_score": 0.5349957007738607 - }, - "overall_score": 0.3313115037088191 - }, - "NVLM": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.2420528895703979, - "micro_mean_score": 0.23838419989257642 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.21589726765847422, - "micro_mean_score": 0.21406043849932396 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.3478114310231307, - "micro_mean_score": 0.3947549441100602 - }, - "overall_score": 0.25566537510391796 - }, - "InternVL2_2B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.09089701489596874, - "micro_mean_score": 0.09036328295381871 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.13141974398938763, - "micro_mean_score": 0.13063500716262516 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.23864417043743646, - "micro_mean_score": 0.24901117798796224 - }, - "overall_score": 0.14522090778963154 - }, - "Qwen2_VL_2B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.16448220309703876, - "micro_mean_score": 0.1610710186451323 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - 
"macro_mean_score": 0.20877163406364055, - "micro_mean_score": 0.20561526268932287 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.3154302566225611, - "micro_mean_score": 0.33856405846947557 - }, - "overall_score": 0.22249997162072932 - }, - "Aquila_VL_2B": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.16317824309838627, - "micro_mean_score": 0.16198837245148487 - }, - "core_cot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.159970161379836, - "micro_mean_score": 0.15844711671722148 - }, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.24567572098570653, - "micro_mean_score": 0.2704213241616509 - }, - "overall_score": 0.17379673035120966 - }, - "Mammoth_VL": { - "core_noncot": { - "num_eval_tasks": 440, - "num_eval_samples": 6539, - "num_not_eval_samples": 0, - "macro_mean_score": 0.264052880412689, - "micro_mean_score": 0.2626894374387823 - }, - "core_cot": null, - "open": { - "num_eval_tasks": 65, - "num_eval_samples": 1163, - "macro_mean_score": 0.37992668750165337, - "micro_mean_score": 0.40120378331900275 - }, - "overall_score": 0.27896733083008046 - } -} \ No newline at end of file diff --git a/static/eval_results/Default/llava_onevision_72B/summary_results.json b/static/eval_results/Default/llava_onevision_72B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2eb71da75405e6141add0c6e95de67741daab5e6 --- /dev/null +++ b/static/eval_results/Default/llava_onevision_72B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.2974368415462532, + "micro_mean_score": 0.2956217833156672 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.4599484231632498, + 
"micro_mean_score": 0.4850386930352536 + }, + "overall_score": 0.31835417383358944 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.3305832092026115 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.2664116432811501 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.3495276153952721 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.39896965542882173 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2861655413017371 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.45858638429470816 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5934010025062657 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.30604427435146236 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.27899574672445293 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.12433347702554473 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.21351320454567943 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.21295992410688594 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.32763074938212144 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 
0.39619210332031635 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.43323889670054355 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.39984139901797444 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.20740773655402334 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.25996561636037274 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.25556145878894343 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.3399610538914775 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.2976261136565818 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.42431325255029934 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3580583490549799 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.19965041572184428 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.22764372137050506 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.33685775371860216 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.39619210332031635 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.22870674199032645 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.32000636054527115 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.2754908385554327 + 
}, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.1617746235264615 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.24538794012228551 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.37170152595100986 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.40489633872843234 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.2903113179276 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.41431490471877547 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.30623783684939326 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/llava_onevision_72B/task_results.json b/static/eval_results/Default/llava_onevision_72B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0a83dc338e703fdbe458d83b59a8a58a872050a1 --- /dev/null +++ b/static/eval_results/Default/llava_onevision_72B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.4694874961506745, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.0013130777246558834, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.2833333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.10204081632653059, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_similar_scene_retrieval", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { 
+ "name": "photo_sharing_image_retrieval", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.4063775510204081, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": 
"av_human_multiview_counting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.7, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.45686454871884236, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.10277361750041909, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5647397973618976, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.4545454545454545, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.26071428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + 
"num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.21063161048981988, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.554, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.7673845718816646, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, 
+ "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.48888888888888893, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.17777777777777776, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + 
"input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.12605042016806725, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.6391210526315788, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.30158730158730157, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5830473772319825, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + 
"name": "landmark_check_two_images", + "score": 0.8222222222222223, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6007894736842104, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.529357142857143, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.4523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.78125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.15178571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.496031746031746, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"widerface_face_count_and_event_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.42857142857142855, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8263285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.559142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.21560071069920497, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.4171368421052631, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.4017857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + 
"score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.19642857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.5777777777777777, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geographic_remote_sensing_land_cover", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.11428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.29166666666666663, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": 
"visual_prediction_rater_depth_estimation", + "score": 0.15476190476190474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 
0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_polyp_segmentation_single_object_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + 
"score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal 
Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07748064743138636, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": 
[ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "worldle", + "score": 0.1675305866800247, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + 
"app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.007919756956365811, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.8761904761904761, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.12244897959183673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.21479662373299105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.35, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical 
Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.3497448979591837, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.2642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.011904761904761904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and 
Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + 
"app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 
0.18095238095238098, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.15151515151515152, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.3406862745098039, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.6019887092978411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "recover_masked_word_in_figure", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + 
"score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", 
+ "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" 
+ ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.3095238095238096, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.040960740741354244, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" 
+ }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "functionality_matching_in_different_objects", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.35714285714285715, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pokemon_3D_recognition", + "score": 0.23333333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], 
+ "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6370339174257883, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.5882352941176471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + 
"output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + 
"Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.4000000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and 
Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.08484848484848483, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.21739130434782608, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, 
+ { + "name": "constrained_generation_contain_contain_images", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.5952380952380951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + 
"input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": 
"multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"MMSoc_HatefulMemes", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.635294117647059, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", 
+ "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding 
and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + 
], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.36981292517006803, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.28985260770975046, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"ocr_math_equation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_control", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": 
"1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language 
Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + 
"app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.248, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.32142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6571428571428571, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8428571428571431, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.5571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.6448275862068966, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.4937499999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "generated_video_artifacts", + "score": 0.29375, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.29999999999999993, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "nextqa_oe", + "score": 0.31052631578947365, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "tweets_captioning", + "score": 0.45714285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.6931034482758622, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.5214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.6000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.7642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "figurative_speech_explanation", + "score": 0.7310344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.4444444444444444, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "docci_image_description_long", + "score": 0.6714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene 
and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_detail_description", + "score": 0.4842105263157894, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.5333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_qa", + "score": 0.6214285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "bar_chart_interpretation", + "score": 0.4827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical 
and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "iq_test", + "score": 0.36551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.24666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "unusual_images", + "score": 0.6827586206896553, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4193548387096775, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + 
], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.4551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.6857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.46, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.23333333333333334, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video_short_title", + "score": 0.6785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + 
"app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6346153846153846, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.41111111111111115, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "activitynetqa", + "score": 0.46842105263157896, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "visualization_with_code", + "score": 0.4071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.6413793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"guess_image_generation_prompt", + "score": 0.8157894736842105, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.5517241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "paper_review_writing", + "score": 0.42666666666666664, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.10714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": 
"1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.07857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.11428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"multi_lingual_Ruozhiba_expalnation_English", + "score": 0.11428571428571431, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.40714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.745, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + 
"skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.5700000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6950000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8631578947368422, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.805, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + 
"name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.2714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 
0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/Default/llava_onevision_7B/summary_results.json b/static/eval_results/Default/llava_onevision_7B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..28bba3867965ba53739f13ae6e96aa8e53be256d --- /dev/null +++ b/static/eval_results/Default/llava_onevision_7B/summary_results.json @@ -0,0 +1,251 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 440, + "num_eval_samples": 6539, + "macro_mean_score": 0.21362697219149712, + "micro_mean_score": 0.21073910058505504 + }, + "open": { + "num_eval_tasks": 65, + "num_eval_samples": 1163, + "macro_mean_score": 0.33979975321921935, + "micro_mean_score": 0.36474634565778147 + }, + "overall_score": 0.2298670331158574 + }, + "keyword_stats": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.24537135448488254 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.1811965364419926 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.24900339991899337 + }, + "Scene and Event 
Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.29226125591371144 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.18715552665467763 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.35893459469741823 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4261779448621554 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.23519232289471675 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.21092208834236795 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.05120126248868793 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.12311222499137182 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.13426264370971033 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.23906185495910173 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2849009636494337 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3190474713712686 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.3199649328875728 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.11485315822870823 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.1918463706234823 
+ }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.1576113794541456 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.25634975084939915 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.20701788214046113 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.3110674582207745 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.2780982759930128 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.15675547996976566 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.1264959299900332 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.24529675139199647 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2849009636494337 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.18774519614511564 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.22945291608261006 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.17072269450773786 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.0916060531149872 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.13896832926719074 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.2764008849458886 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3730746617295249 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.24378647619197266 + 
}, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.31371933170977295 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.2211309948951352 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/Default/llava_onevision_7B/task_results.json b/static/eval_results/Default/llava_onevision_7B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ca9b89fa27cd44459fa62b05606715ba8cea739a --- /dev/null +++ b/static/eval_results/Default/llava_onevision_7B/task_results.json @@ -0,0 +1,7756 @@ +[ + { + "name": "face_identity_matching", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "IAM_line_ocr_and_locate", + "score": 0.18393003523735163, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "av_multicamera_tracking_predict_bbox", + "score": 0.034222739980969856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "9-image or more" + }, + { + "name": "funsd_document_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "clevr_arithmetic", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "graph_connectivity", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "visualdial_visual_dialog_image_guessing", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "clevrer_object_existence_video", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.1530612244897959, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "star_object_interaction_video", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_content_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"places365_similar_scene_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cam_traj_to_video_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "math_breakpoint", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "photo_sharing_image_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "perception_test_video_character_order", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "coco_ood_global_image_retrieval_by_query_property", + "score": 0.23841991341991345, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "arc_agi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "vizwiz_quality_accessment_for_blind", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "av_human_multiview_counting", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", 
+ "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "emotion_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "6-8 images" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3180487534871889, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.07094776572587472, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "quizlet_question_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.27730527983349595, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_moving_direction_video", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.3246753246753246, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_cycle", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "av_vehicle_multiview_counting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "6-8 images" + }, + { + "name": "animal_pose_estimation", + "score": 0.21124366266226283, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + 
"input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.3466666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "sta_action_localization_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "insect_order_classification", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "perception_test_object_shuffle_video", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "long_string_number_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "video_to_camera_trajectory_retrieval", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "cheapest_flight_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { 
+ "name": "graph_theory", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.3111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "muma_theory_of_mind_social_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "clevrer_video_moving_object_property_recognition", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "action_sequence", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "game_info_retrieval", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "av_view_identification", + "score": 0.12222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "code_programming_test_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "muma_theory_of_mind_belief_of_goal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "code_programming_extremely_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Planning and Decision Making" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.04201680672268908, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_fragments_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.39, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.3174603174603175, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.19287649674503982, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "vlnqa_egocentric_navigation_video", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "landmark_check_two_images", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "figureqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": 
"1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.4274736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.23392857142857157, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multi_load_type_prediction_from_plot", + "score": 0.19047619047619047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": 
"hotel_booking_confirmation_parsing", + "score": 0.021428571428571432, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "action_prediction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "code_error_line_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "math_parity", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.13392857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.24206349206349206, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nextqa_mc", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "multiple_choice", + 
"num_input": "video" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_retrieval", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "clevrer_video_moving_object_count", + "score": 0.19047619047619047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 21, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.07142857142857142, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.36628571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.3349285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logo2k_same_type_logo_retrieval", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "single_person_pose_estimation", + "score": 0.04435358813204807, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.3337526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_hamiltonian_path", + "score": 0.4107142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "youtube_video_info_parsing", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "perception_test_video_action_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "video" + }, + { + "name": "music_info_parsing", + "score": 0.13392857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and 
Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"geographic_remote_sensing_land_cover", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.11428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"multiple_states_identify_europe", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "visual_prediction_rater_plane_segmentation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_novel_view_synthesis", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_3d_assembled_quality_understanding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + 
"num_input": "2-3 images" + }, + { + "name": "visual_prediction_rater_semantic_segmentation", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_panoptic_segmentation", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_surface_normal_estimation", + "score": 0.23809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_depth_estimation", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_prediction_rater_openable_part_segmentation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": 
"multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "visual_correspondance_in_two_images", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_circle_reasoning", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "vln_tegulu_next_step", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "flowchart_code_generation", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_eval_factual_pref", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": 
"medical_polyp_segmentation_single_object_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "recipe_image_ordering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "healthcare_info_judgement", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_visualization_output_understanding", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 10, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_view", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "video_grounding_temporal", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09455782312925169, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and 
Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "web_action_grounding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "vln_english_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": 
"contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "logical_reasoning_2d_folding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_hard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "worldle", + "score": 0.007154015950258795, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_keywords_based_retrieval_non_radiology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "sign_language", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "video_camera_motion_description", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "exact_text", + "num_input": "video" + }, + { + "name": "video_segments_reordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.8642857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": 
"Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_grounding_spatial", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_match_problem", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "music_sheet_sentiment", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": 
"scibench_calculus_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_eval_visual_pref", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "image_translation_en2cn", + "score": 0.09394266841882398, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "semantic_matching_of_two_images", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + 
"score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.10204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "planning_visual_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "code_retrieval", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and 
Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "booking_web_recommendation", + "score": 0.3779761904761904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "photoshop_operation", + "score": 0.19047619047619047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "llavaguard", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_homepage_profile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "vln_hindi_next_step", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "9-image or 
more" + }, + { + "name": "media_recommend_solutions_stackoverflow", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "dish_ingredient_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "google_streetview_direction_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "music_sheet_name", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "code_translation_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "ancient_map_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text 
Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_2D_views_of_3D_shapes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "paper_review_acceptance", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "rocks_samples_compare", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_reasoning", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "video_intent_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "video" + }, + { + "name": "counting", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.38095238095238093, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_solution_compare", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "2d_image_jigsaw_puzzle_easy", + "score": 0.2678571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "medical_content_based_retrieval_radiology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + 
"name": "song_title_identification_from_lyrics", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.10101010101010101, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiview_reasoning_camera_moving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rocks_samples_identify", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "mensa_iq_test", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "booking_web_rating", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "web_action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "comic_page_ordering", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "6-8 images" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_event", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "2-3 images" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.552269841333154, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vln_identify_robot", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + 
"output_format": "exact_text", + "num_input": "9-image or more" + }, + { + "name": "multilingual_news_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "move_pos_to_pos_hanoi_4_pole", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting_multi_image", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "2-3 images" + }, + { + "name": "code_translation_advanced", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "signage_navigation", + 
"score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "google_streetview_circle_sorting", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_sign_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "6-8 images" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_author", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "video_action_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "structured_output", + "num_input": "video" + }, + { + "name": 
"functionality_matching_in_different_objects", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "Movie_retrieval_by_actor", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "TV_show_retrieval_by_character", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "9-image or more" + }, + { + "name": "Forensic_Detection_of_different_images", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"pokemon_3D_recognition", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "paper_review_rating", + "score": 0.6449456155428795, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "numerical_data", + "num_input": "4-5 images" + }, + { + "name": "number_comparison", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TRANCE_physics_reasoning_basic", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "waldo", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "google_streetview_line_sorting", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "9-image or more" + }, + { + "name": "video_eval_dynamic_pref", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "product_ocr_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 
0.13157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_translation_Python", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "2-3 images" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_parasite_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "structured_output", + "num_input": "6-8 images" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vln_identify_location", + "score": 0.14545454545454545, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "structured_output", + "num_input": "4-5 images" + }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_repeat_position_only_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_multi_contain_repeat", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition 
and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_xor_images", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_position_images", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_multi_contain_position_only", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_contain_images", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_interactive_operations_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], 
+ "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + 
"num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "app_interactive_operations_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "crossword_mini_5x5", + "score": 
0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6823529411764707, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.7142857142857143, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "reward_models_T2I_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", 
+ "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5952380952380951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + 
"Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "video_motion_matching_3D_real", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "video_motion_matching_real_3D", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ball_cup_swap_3", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "9-image or more" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.19047619047619047, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.21116780045351471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" 
+ ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_subject", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" 
+ }, + { + "name": "autorater_control", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_aesthetics", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_mask", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_semantics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_motion_guided_editing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "4-5 images" + }, + { + "name": "autorater_3d_model_texturing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Metrics", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "autorater_unmask", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "2-3 images" + }, + { + "name": "poetry_haiku", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_abdomen_MRI_organ_recognition", + "score": 0.09523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_abdomen_endscopy_organ_recognition", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "4-5 images" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_retrieval_given_surgeon_activity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Videos", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "video" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": 
"Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_output_result", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + 
{ + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_add_tag", + "score": 0.3333333333333333, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "2-3 images" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "painting_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "4-5 images" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.5620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, 
+ "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.39285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.3689655172413792, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.1482758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.3111111111111111, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + 
"skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_summary", + "score": 0.25000000000000006, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_magic_video", + "score": 0.3866666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "paper_review_writing", + "score": 0.12666666666666665, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "activitynetqa", + "score": 0.38947368421052636, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and 
Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.16399999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "generated_video_artifacts", + "score": 0.10000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funny_image_title", + "score": 0.4285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "wikihow_complex_task_completion", + "score": 0.36666666666666664, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "9-image or more" + }, + { + "name": "video_detail_description", + "score": 0.21052631578947367, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "funqa_unexpected_action_creative_video", + "score": 0.23333333333333336, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7736842105263159, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.45000000000000007, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_qa", + "score": 0.7, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "image_humor_understanding", + "score": 0.5206896551724137, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", 
+ "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.23571428571428577, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "nextqa_oe", + "score": 0.30526315789473674, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "graph_interpretation", + "score": 0.29310344827586204, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.5586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_short_title", + "score": 0.3785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Videos", + "app": "Information_Extraction", + 
"output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "video2notes", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "electrocardiogram", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5500000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "doc_vqa", + "score": 0.5125000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "4-5 images" + }, + { + "name": "docci_image_description_long", + "score": 0.6714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.42666666666666664, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "video_content_follow_up", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Planning and Decision Making" + ], + "input_format": "Videos", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.5129032258064516, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.4482758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.293103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.3448275862068966, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object 
Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funqa_unexpected_action_humor_video", + "score": 0.24666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Videos", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "video" + }, + { + "name": "defeasible_reasoning", + "score": 0.5275862068965516, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.692857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.21428571428571425, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.25, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.5050000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.6450000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_B", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_politics", + "score": 0.6599999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visual_order_A", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "2-3 images" + }, + { + "name": "red_teaming_racial", + "score": 0.71, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.910526315789474, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.12142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"bridge_strategies_expert", + "score": 0.4214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Spanish", + "score": 0.08571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Russian", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_Chinese", + "score": 0.19999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding 
and Generation", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_manual_explanation_scooter_French", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "6-8 images" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "sceneqa_scene_transition_video", + "score": 0.2714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Videos", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "video" + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Aquila_VL_2B/summary_results.json b/static/eval_results/SI/Aquila_VL_2B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ff246bf1cd8585833334967628ff5f37f092ebf6 --- /dev/null +++ b/static/eval_results/SI/Aquila_VL_2B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + 
"num_not_eval_samples": 0, + "macro_mean_score": 0.20770364903712493, + "micro_mean_score": 0.20333142638522636, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.31474202723571276, + "micro_mean_score": 0.3326568265682657, + "missing_tasks": [] + }, + "overall_score": 0.22197543279693666 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.17480107496737848 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2374462987863378 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.3521969849344277 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.19504930283108274 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.2521179990443663 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.20221672149607509 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.18502360430789122 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.0625675073438388 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.3826225373137124 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6020225563909773 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.11601893140078427 + }, + "Text-Based Images and Documents": { + "count": 53, + 
"num_samples": 847, + "tasks": [], + "average_score": 0.11430966292465267 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.3533180891172854 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.2248398559924241 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3078950207372175 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.10279080594456047 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.13944236147744013 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.1772030496280578 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.1884228017877996 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23519563962981577 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.28092356180071465 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.43875114784205704 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2219754327969366 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.08500232938689507 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.08421801129956362 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.23446107609710548 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3004030430829456 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + 
"tasks": [], + "average_score": 0.39206349206349206 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.2897054521388083 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2736043135287443 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19099680045595863 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Aquila_VL_2B/task_results.json b/static/eval_results/SI/Aquila_VL_2B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..47f26fa305c99a0ba9c008fc00a7e43c18f187ce --- /dev/null +++ b/static/eval_results/SI/Aquila_VL_2B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"game_platform_support_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.20902382802982977, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.024221420767560734, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.08027210884353742, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.7023809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.0196078431372549, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.10101010101010101, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.008685714285714291, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.02728669632537238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "realworld_qa_en2cn", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.28827816841022125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 
0.13157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.38095238095238093, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.23691578947368427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.5319681933751316, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.12301587301587302, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "graph_maxflow", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.08778571428571448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + 
"score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.16964285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.2531585786626469, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.2272727272727272, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.2795714285714286, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.18333333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.3717857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.5177142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.11224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.041666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.32, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 0.16071428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.0627244690319781, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.22222222222222218, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.03361344537815126, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 
0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "long_string_number_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.34842105263157896, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 
0.28888888888888886, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.4186666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5224414005921536, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.3210526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"MMSoc_Misinformation_GossipCop", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.611764705882353, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"autorater_artifact_reason", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.16326530612244897, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 
0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.4344827586206897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.26206896551724135, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.20344827586206893, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.42857142857142866, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.4448275862068965, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": 
"table2latex_complex", + "score": 0.3444444444444444, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.23103448275862065, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.44482758620689644, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.3931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.2413793103448275, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "humor_explanation", + "score": 0.36666666666666664, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 0.3357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.22758620689655168, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.29999999999999993, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electrocardiogram", + "score": 0.1857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5269230769230768, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.09, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4451612903225806, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.44482758620689666, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.6631578947368422, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.32857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.1928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.15000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.41428571428571426, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": 
"red_teaming_visualmisleading", + "score": 0.8473684210526317, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.5850000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.5999999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.4499999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.6799999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Aria/summary_results.json b/static/eval_results/SI/Aria/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5648c2026d713e85a8b3c03c640ec9f3a4d53c86 --- /dev/null +++ b/static/eval_results/SI/Aria/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.3178882776147889, + "micro_mean_score": 0.3101511832828904, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.5137437248005172, + "micro_mean_score": 0.5472939729397295, + "missing_tasks": [] + }, + "overall_score": 0.34400233723955265 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.3653361644690575 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.33433893000455434 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.49083567506460973 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], 
+ "average_score": 0.300830802045758 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.40684369400912745 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.3401734439719901 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.22595636868728874 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.07560632809892315 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5240018518464876 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7129097744360902 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.27807228404309764 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.259572791833904 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.45572004760273754 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.3300885226603808 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.40912566596786665 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.04960831797041802 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.3227895527307711 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3053148323646246 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": 
[], + "average_score": 0.2579833154471113 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3082165471908181 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.45805038774421686 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4787572696663607 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3440023372395526 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.33746818901184633 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.10860172719687727 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.38003253384687213 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.4433718463877228 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4142857142857143 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.3496496998103286 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4097428531166082 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.22745674367681176 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Aria/task_results.json b/static/eval_results/SI/Aria/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c1be1ac46152552e5921e47545e02af483a31143 --- /dev/null +++ b/static/eval_results/SI/Aria/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 
0.16326530612244897, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.4659892098786556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 
+ }, + { + "name": "medical_cell_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.02157313400640287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": 
"planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.9404761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.317156862745098, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.12121212121212122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.17142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "play_go_capture_stone", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.21739130434782608, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.041666666666666664, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.001383202390173732, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.2599548305299542, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 0.18421052631578946, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "media_QA_web_stackoverflow", + "score": 0.5238095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.45596842105263163, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.5841053655504803, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.5317460317460317, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.13450000000000023, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "3d_indoor_scene_text_bbox_selection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.13156684963502452, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.3482142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"chess_winner_identification", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.33700342022200497, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "question_solution_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.8506493506493505, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.7401428571428569, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5095714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5952380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9448142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.5642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "code_programming_test_easy", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.66, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 0.5982142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.027788064512264614, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.6984126984126985, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.8865546218487397, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.78125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + 
{ + "name": "long_string_number_recognition", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.6926315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.21521614907043682, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6946666666666669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5392040026539403, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.47489473684210526, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.09285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.47619047619047616, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6705882352941178, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 
0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.25212585034013607, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.1272675736961451, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14 + }, + { + "name": "ocr_math_MATH", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"multiple_states_identify_asia", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.789655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.45, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.6517241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table2latex_complex", + "score": 0.5333333333333334, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.7517241379310344, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.7103448275862071, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.4517241379310344, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "ocrqa", + "score": 0.7724137931034484, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.7793103448275864, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.5724137931034483, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { 
+ "name": "humor_explanation", + "score": 0.42666666666666664, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 0.557142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.6000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electrocardiogram", + "score": 0.2714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7307692307692306, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.6214285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.21000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.6142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.44838709677419364, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.6965517241379309, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.805263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.4071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", 
+ "score": 0.3214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.2357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.43571428571428567, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.5428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.10714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.43571428571428567, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.10714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.910526315789474, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.7750000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.6950000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.45, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.6950000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 
20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Claude_3.5/summary_results.json b/static/eval_results/SI/Claude_3.5/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f8d28fc504e0065f3389cbbbf63b00505e1bcc62 --- /dev/null +++ b/static/eval_results/SI/Claude_3.5/summary_results.json @@ -0,0 +1,215 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "macro_mean_score": 0.520276385877485, + "micro_mean_score": 0.5148202137998056 + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "macro_mean_score": 0.6479684260295507, + "micro_mean_score": 0.6801968019680197 + }, + "overall_score": 0.5373019912310938 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.6192941518442948 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.5499261524919171 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.636886763741019 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.5044379729567133 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5757222503903228 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.530309401925396 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.4511182385296208 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.19196633042767672 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.6017116084931068 + }, + 
"Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7033233082706767 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.54981020669637 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4753194125515341 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5314705050989759 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.5589506892621444 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.6014068374421209 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.34512576094802216 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5556080592390198 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.5072889926389097 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.5112348724553849 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.4712835541311676 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.5769294912151234 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6164633346451529 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5373019912310933 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.6692574633083122 + }, + "Planning": { + "count": 44, + "num_samples": 714, + 
"tasks": [], + "average_score": 0.315623741632974 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.6124985410830999 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.6061759059165749 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4174603174603175 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.5134329832846579 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.5401030980230185 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4760293511799448 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Claude_3.5/task_results.json b/static/eval_results/SI/Claude_3.5/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c8339ec0de99b6d9e20d3247591052dc6ab7fd8c --- /dev/null +++ b/static/eval_results/SI/Claude_3.5/task_results.json @@ -0,0 +1,4818 @@ +[ + { + "name": "image_translation_en2cn", + "score": 0.47189890122171807, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene 
and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.42857142857142855, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.9642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language 
Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.7172619047619049, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.06698805429719713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.42424242424242425, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific 
Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.0035714285714285718, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.5495098039215687, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.07140372068949602, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5487385867546344, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.8235294117647058, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.7448979591836732, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.6787142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.7321428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.5416666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 
0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.22448979591836735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09714285714285713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.5217391304347826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"icon_arithmetic_puzzle", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.8250714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6207368421052633, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, 
+ "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.2688508092335989, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.7853333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.7053571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": 
"User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.8303571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.6904761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7698412698412698, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.7095421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5565966568582713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.04739437903890144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.5987447167547407, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.5753130452443872, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", 
+ "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9087301587301589, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.9415584415584416, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based 
Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.333520279485717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7636842105263157, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"humor_understand_caption_match", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.7071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5531252543894322, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.8095238095238094, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + 
}, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.9047619047619049, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative 
Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.7714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + 
"score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.82, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + 
"skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9841571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.3397142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + 
"score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and 
Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 
0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.5346938775510204, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.4522108843537415, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.5529411764705884, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": 
"Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.8235294117647058, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and 
Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.8142857142857142, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.7, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.5857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.7, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding 
and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + 
"Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.8450000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition 
and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.7222222222222222, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.8142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.258, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8473684210526317, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.5357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.4928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.8931034482758619, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.882758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8551724137931034, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.8827586206896549, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.3285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7307692307692307, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.7928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.5866666666666666, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.45806451612903226, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.6482758620689654, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.8931034482758619, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.6499999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.7517241379310345, + "eval_type": "llm", + "num_demo": 
1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.6931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8310344827586205, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.8551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.9357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": 
"1-image" + }, + { + "name": "visualization_with_code", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.7071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.9349999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8850000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.8100000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and 
Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.4142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.4928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + 
"input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.4714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.4357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.5785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.47857142857142854, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": 
"Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Claude_3.5_new/summary_results.json b/static/eval_results/SI/Claude_3.5_new/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3c88676ad4f2fab27160f4dcc3b8dfe2136a9e2d --- /dev/null +++ b/static/eval_results/SI/Claude_3.5_new/summary_results.json @@ -0,0 +1,215 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "macro_mean_score": 0.5462752278980763, + "micro_mean_score": 0.5417881438289601 + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "macro_mean_score": 0.6764020657053476, + "micro_mean_score": 0.6924969249692496 + }, + "overall_score": 0.5636254729390457 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.6242355223474262 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.583387314927874 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.6507240054983652 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.5171075478248572 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.6234123112506059 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.5426169039575065 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.48795977188332873 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.22440221381985706 + }, + "Commonsense and Social Reasoning": { + "count": 
38, + "num_samples": 654, + "tasks": [], + "average_score": 0.6433122980573076 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6839924812030076 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.574168555556774 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4705509892153899 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5838312144672865 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.5899091882733952 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.5927094432064197 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3619606028475468 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5638133905687104 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.5249488326690246 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.5300876558354416 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.5106873710119535 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.6409616762702612 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6380252743889108 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5636254729390459 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + 
"average_score": 0.6633000290867174 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.3511145464456188 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.565344887955182 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.6465631513465354 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.6285714285714286 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.5580232103280633 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.5737128945237007 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4831956110227109 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Claude_3.5_new/task_results.json b/static/eval_results/SI/Claude_3.5_new/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..264152a89dca9bb2f8d943620c545b74d2e3a5ff --- /dev/null +++ b/static/eval_results/SI/Claude_3.5_new/task_results.json @@ -0,0 +1,4818 @@ +[ + { + "name": "medical_cell_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.8571428571428571, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.23684210526315788, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.40241040325976846, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { 
+ "name": "paper_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" 
+ ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5952380952380951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.5151515151515151, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.40476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + 
"score": 0.9642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.6607142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + 
"score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.4946078431372549, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.06110399705595322, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.003401360544217687, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial 
and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5750644816731951, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.7647058823529411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.02989318393830872, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.24489795918367346, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6530612244897959, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + 
"num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.7402142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.7589285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + 
"skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.4583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.7142857142857143, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", 
+ "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.8218571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.32653061224489793, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.18566544566544566, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.8389999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.5217391304347826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and 
Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.26289170215820523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.868, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.7142857142857143, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.711111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.7261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.6805363628538211, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.78125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.5714285714285714, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.8650793650793652, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + 
"Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.8811526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.6339993725717702, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + 
"output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4444444444444445, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9166666666666669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", 
+ "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.9480519480519481, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and 
Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"single_person_pose_estimation", + "score": 0.2531109353882501, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7710526315789472, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition 
(OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.7642857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.6062431664706708, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.08106406283795066, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.6274393183836207, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.82, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object 
Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9743499999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.9285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.23921428571428613, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, 
+ { + "name": "ocr_resume_school_plain", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition 
and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.5666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" 
+ }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.5529411764705883, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"app_layout_understanding_iphone_settings", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + 
"name": "crossword_mini_5x5", + "score": 0.6928571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.8235294117647058, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" 
+ }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.3333333333333333, + "eval_type": "rule", + 
"num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.5210884353741496, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.47066326530612246, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.7428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.6285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.6285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.5, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.6888888888888888, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.8642857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.282, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8578947368421055, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + 
"app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.7142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.9103448275862066, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.6000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.8620689655172412, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.893103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.43571428571428567, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7038461538461539, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.7928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.7266666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 
0.4580645161290323, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.6931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.8344827586206894, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.7071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.8379310344827589, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + 
"score": 0.7413793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.786206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.8379310344827587, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.942857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.7785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.9650000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.825, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.7699999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.8350000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event 
Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.29999999999999993, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.32142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.607142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social 
Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.6214285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.5214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.5928571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface 
Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/SI/GPT_4o/summary_results.json b/static/eval_results/SI/GPT_4o/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3fd663e31742e6d071f91953e3b43b831fc05c54 --- /dev/null +++ b/static/eval_results/SI/GPT_4o/summary_results.json @@ -0,0 +1,215 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "macro_mean_score": 0.5529953662872719, + "micro_mean_score": 0.5483479105928085 + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "macro_mean_score": 0.6600228904804206, + "micro_mean_score": 0.6801968019680197 + }, + "overall_score": 0.5672657028463584 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.6400436962819274 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.5798789532163023 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.6933181759121947 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.47164342831848766 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.6512174145248227 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.5506629280904943 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.4267383416112408 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.1970421212123289 + }, + "Commonsense and Social Reasoning": { + "count": 38, 
+ "num_samples": 654, + "tasks": [], + "average_score": 0.6716375018861761 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7342894736842105 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.6093502418300007 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4938444672052553 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.6107746700730057 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.533172482404735 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.6086090683867454 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3427989299648589 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5370230887013343 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.5351259728352326 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.6016521462358102 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.4632537848154335 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.6563556079088679 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6204512659058113 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5672657028463585 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + 
"average_score": 0.7387886231372116 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.302146719713088 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.5785991479925302 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.6418126331560571 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.626984126984127 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.5184702350129554 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.6073751328612617 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4387500704123191 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/GPT_4o/task_results.json b/static/eval_results/SI/GPT_4o/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..876a4f2ad46b798d3dd71ec35bfa178b47f72cc4 --- /dev/null +++ b/static/eval_results/SI/GPT_4o/task_results.json @@ -0,0 +1,4818 @@ +[ + { + "name": "image_translation_en2cn", + "score": 0.5564421945052599, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.5571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.8928571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.5050505050505051, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event 
Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.3571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4379245788668292, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.40294117647058825, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.1858388265990491, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + 
"score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5925323909834338, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.0009041591320072332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + 
"Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.01601312748867357, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision 
Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6632653061224488, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.4767857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.8947368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.3673469387755102, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"code_programming_test_easy", + "score": 0.4583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.1496598639455782, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.6956521739130435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" 
+ ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.7872142857142859, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.7857142857142857, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6918947368421055, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.2785198065092178, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.828, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.8303571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.711111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.95, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"youtube_video_info_parsing", + "score": 0.8095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.3469387755102041, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.5982549376215841, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.8253968253968255, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + 
"num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.7131684210526317, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.5867591836191252, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.6444444444444445, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.9285714285714288, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition 
(OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8766233766233764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + 
"num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.2903422951989705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + 
"input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7417368421052631, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.7142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": 
"1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.6477943776571286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + 
"Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5807339650392197, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.18559785992971775, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.74, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + 
"score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.8333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and 
Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9764785714285713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.9047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.12478571428571421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "dvqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.55, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + 
"Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + 
"app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.7647058823529411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autorater_artifact_reason", + 
"score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object 
Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.4562925170068027, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.36553287981859406, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + 
"score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.7428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.7571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.5428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + 
"app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 
0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.5, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8949999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, 
+ "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.9, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.7250000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.765, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.7578947368421054, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.7142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.7214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.7071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.6785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.6777777777777777, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.9142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.36200000000000004, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.6928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language 
Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8315789473684211, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.6642857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7423076923076924, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.8428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.8666666666666668, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.49354838709677434, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.9214285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.6357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.3571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface 
Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.8620689655172411, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.8310344827586206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8793103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.8689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.7310344827586206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.9068965517241377, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.6172413793103447, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.627586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8310344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.8275862068965518, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/SI/GPT_4o_mini/summary_results.json b/static/eval_results/SI/GPT_4o_mini/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cc69433e9989576e5a8c7974b79e624af8c0838b --- /dev/null +++ b/static/eval_results/SI/GPT_4o_mini/summary_results.json @@ -0,0 +1,215 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "macro_mean_score": 0.4431039098921726, + "micro_mean_score": 0.43780369290573373 + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "macro_mean_score": 0.595574663769726, + "micro_mean_score": 0.6334563345633456 + }, + "overall_score": 0.46343334374251305 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.503118803606002 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.48241878593503174 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.5987052352447554 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.37680368570252215 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5458509360302554 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.4555977624507237 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.33277278942510824 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.17294565844175996 + }, + "Commonsense and Social 
Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5775026308600164 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7960714285714285 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.4645916955325127 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.3779902828155749 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5569877095654321 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.4194828137611333 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.5198662454862603 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1194248916897328 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.4761935495255144 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.38282644938937405 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.42048902061937554 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3777213713726476 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.5986898724975707 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5559184922821285 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.46343334374251277 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, 
+ "tasks": [], + "average_score": 0.5484747566251307 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.22983305008250185 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.4556095354808589 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.5437015929631214 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4873015873015873 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.39601047285667923 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.535145025177205 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.33759329198549914 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/GPT_4o_mini/task_results.json b/static/eval_results/SI/GPT_4o_mini/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..25b2399a65df86e6e32cbc02e5ce63907af6297b --- /dev/null +++ b/static/eval_results/SI/GPT_4o_mini/task_results.json @@ -0,0 +1,4818 @@ +[ + { + "name": "relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.4404761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"geometry_reasoning_grid", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.02971437714058806, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.2653061224489796, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.9333333333333333, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object 
Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.30434782608695654, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": 
"Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9119047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + 
"score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.3348039215686274, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + 
"Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.49999999999999994, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"autorater_artifact_reason", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", 
+ "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.5857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 
0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.6904761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and 
Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.21428571428571427, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.48571428571428577, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.43979842890651355, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning 
and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.36666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision 
Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.7857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"poetry_acrostic", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + 
"Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.29292929292929293, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"MFC_Bench_check_face_swap", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.21428571428571427, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.5882352941176472, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial 
and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.22491496598639452, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.2505668934240363, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.17982456140350878, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.048713528589567665, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.024564069093751337, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.6020408163265306, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6480000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + 
"output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"math_parity", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7010526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.43662631578947375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.6955714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.457498007685276, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5423192899685483, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3777777777777777, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + 
"Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.6498716440678927, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5455714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": 
"numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.27142857142857146, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6964285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.7275263157894736, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"autonomous_driving_scene_analysis", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5535393001296958, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and 
Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9505142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": 
"music_info_parsing", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.1969956173950675, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.24388210678357394, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", 
+ "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.4166666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 
0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8506493506493505, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.6357142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.8769841269841271, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.9705882352941178, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.37142857142857144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.11428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding 
and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.41428571428571426, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + 
"score": 0.43050085804176885, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + 
"Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": 
"1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.004214285714285663, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7539682539682541, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 
0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.6142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.8200000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.7214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and 
Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.6931034482758621, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7615384615384616, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.875862068965517, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.5206896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8379310344827584, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and 
Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.8857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.872413793103448, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.7206896551724139, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8482758620689654, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.817241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], 
+ "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.8379310344827586, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.9000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8210526315789474, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.7827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4548387096774193, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" 
+ ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.6857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.5714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.5666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8950000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + 
"num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8800000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.42857142857142855, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.7750000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + 
"skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.2071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.5142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.3285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and 
Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.72, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.3559999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.6571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and 
Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json b/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..95d4d30ad782486786b62a3c713ef22d885e1b95 --- /dev/null +++ b/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json @@ -0,0 +1,215 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "macro_mean_score": 0.43481964330318734, + "micro_mean_score": 0.4297862001943635 + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "macro_mean_score": 0.5787083135236054, + "micro_mean_score": 0.6186961869618696 + }, + "overall_score": 0.4540047993325765 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.4474763430506795 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.47630441828533016 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.5920539115535787 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.4086167264646781 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5122400421391089 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.4655431430975485 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3559690476405975 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + 
"average_score": 0.09741974015331743 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5955368143490581 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7948947368421052 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.3722082840493195 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.33052002642818507 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5613400178213946 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.46724590271207767 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.5535202379362348 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3348446026637953 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.43823554216399857 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3691249729531883 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.42013434507914493 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.43247267273235235 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.5470781816319514 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5905636451090996 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.45400479933257654 + } + }, + 
"app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.45245079667466714 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.21148887498941377 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.47487599206349207 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.5468998820129136 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4380952380952381 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.48499051643275837 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.5086518140501541 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.3853815223607656 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Gemini_1.5_flash_002/task_results.json b/static/eval_results/SI/Gemini_1.5_flash_002/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..acd24bb35a4b89f35e2ec3c316d485f0d30c39d1 --- /dev/null +++ b/static/eval_results/SI/Gemini_1.5_flash_002/task_results.json @@ -0,0 +1,4818 @@ +[ + { + "name": "monthly_weather_days_count", + "score": 0.3095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.8888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.4068877551020408, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.3469387755102041, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + 
], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.03886509470801488, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": 
"1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and 
Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal 
Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": 
"structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.22271751659129607, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5476190476190476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + 
"score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.47990196078431374, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.2727272727272727, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + 
"input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.3100359127375053, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.6, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.0319296239070534, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition 
and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", 
+ "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9404761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and 
Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.49714178831993683, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.7261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text 
Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and 
Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.30612244897959184, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": 
"1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.09826063389901919, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 
0.5408163265306122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event 
Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.5916519873131821, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"memorization_famous_treaty", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface 
Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 
0.2894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5600000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.746390336033466, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"exchange_rate_estimate_plot", + "score": 0.9621285714285712, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7057894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams 
and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5798723155227672, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.9017526315789473, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "figureqa", + 
"score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.7478991596638657, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial 
and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.570486129111546, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.7672857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + 
"input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.21428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.7220526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.24564101770091742, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + 
}, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": 
"symbolic_graphics_programs_computer_aided_design", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.11578571428571437, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.7727272727272726, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": 
"Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.7539682539682538, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.589357142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.3928571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and 
Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.19999999999999998, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.4714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.24492301011444534, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.753968253968254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + 
}, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.30434782608695654, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 
0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": 
"Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07619047619047618, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.35714285714285715, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": 
"Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.35000000000000003, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 
0.3137755102040816, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.2828798185941043, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social 
Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + 
"app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + 
"input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": 
"Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.7666666666666668, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.8571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.5928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8263157894736842, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.6214285714285716, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.45000000000000007, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.9068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.7642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.8533333333333335, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.789655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.7758620689655171, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.6310344827586206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.3071428571428572, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8137931034482758, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.1857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.29999999999999993, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.8448275862068967, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, 
+ "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.47857142857142865, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8300000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.8500000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6884615384615385, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.3806451612903227, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.705, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": 
"Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.24285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.5214285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.7850000000000004, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"bridge_strategies_worldclass", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.32142857142857134, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.6482758620689654, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.8689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.7068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": 
"bridge_strategies_advanced", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json b/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b782cc9ac005a5507ee5cb19b6ce2a0a0098b4ca --- /dev/null +++ b/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json @@ -0,0 +1,215 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "macro_mean_score": 0.4914311038229404, + "micro_mean_score": 0.48323615160349853 + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "macro_mean_score": 0.5814975405131552, + "micro_mean_score": 0.6174661746617466 + }, + "overall_score": 0.5034399620483024 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.5000257619938475 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.5220033468415737 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.6342882147970302 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.448099634405986 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5647567827649111 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.5090111123207751 + }, + 
"Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3972807544005462 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.178032259819607 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5995804836966744 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7830639097744362 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.42724302639929596 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4060403716629095 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5888558035357285 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.5132563067393096 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.6217290675275775 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3592697030984118 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.4972242280053817 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.43754003302746525 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.4731762319443037 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.48334866543174226 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.5644701189535662 + }, + "multiple_choice": { + "count": 33, + 
"num_samples": 567, + "tasks": [], + "average_score": 0.6245091608727974 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5034399620483027 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.504539358390968 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.28536490696494377 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.48587549603174607 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.5964613809728712 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.48888888888888893 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.500158537824293 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.5660366627264668 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4200866579901879 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Gemini_1.5_pro_002/task_results.json b/static/eval_results/SI/Gemini_1.5_pro_002/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f197c4416bb4dd920f70a2abce6c2201fbb412d2 --- /dev/null +++ b/static/eval_results/SI/Gemini_1.5_pro_002/task_results.json @@ -0,0 +1,4818 @@ +[ + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.6199454600186646, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": 
"logical_reasoning_fit_pattern", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5714285714285714, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.4119942575491687, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + 
"Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.29292929292929293, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language 
Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.31428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + 
"name": "location_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4699566675933124, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.4656862745098039, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.9642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.06762834530316385, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial 
and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition 
and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.0035714285714285718, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.3877551020408163, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.8025210084033615, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.831857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.391304347826087, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.03864007436439077, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6723157894736841, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + 
"Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.30612244897959184, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": 
"Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.30454267975765786, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5510204081632654, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.668, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.7153571428571428, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.48214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.4375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + 
"Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.48214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": 
"Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.9523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5238095238095237, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + 
"output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.14711083476825218, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.7460317460317462, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" 
+ ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.6758816417011395, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": 
"graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.82, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.8486368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + 
"Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + 
"output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.555696767990635, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7089473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.8174603174603176, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": 
[ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.3857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.8246753246753247, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", 
+ "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.66869355335515, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9712357142857144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + 
"skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_maxflow", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": 
"contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.32509082865144884, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": 
"multiple_choice", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.2693571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.8333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical 
Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.7357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", 
+ "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.35, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + 
"eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": 
"Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event 
Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.7142857142857143, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + 
"output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object 
Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.36734693877551017, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.40232426303854874, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.5, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition 
and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.6117647058823531, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], 
+ "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5945319390969315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + 
"input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": 
"symbolic_graphics_programs_computer_aided_design", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.4142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.5142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.6285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.3714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + 
"score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 
0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "table_understanding", + 
"score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.7111111111111111, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8263157894736843, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and 
Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.5357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.48, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.692857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.8896551724137929, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.7571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding 
and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.8, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.8241379310344826, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.8206896551724137, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.8103448275862067, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.42580645161290337, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": 
"Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.8758620689655172, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.6655172413793102, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.6000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data 
Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.8137931034482755, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.7965517241379312, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8857142857142859, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + 
"input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.82, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.7750000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.74, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6346153846153848, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.765, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.21428571428571433, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial 
and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.15000000000000005, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.3428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.22142857142857147, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.3000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.23571428571428577, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and 
Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Idefics3/summary_results.json b/static/eval_results/SI/Idefics3/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..486dce3311f80e350c2765b963dfc7581e29f78f --- /dev/null +++ b/static/eval_results/SI/Idefics3/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.08941182847569326, + "micro_mean_score": 0.08779475233900695, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.3231434267517844, + "micro_mean_score": 0.3618081180811809, + "missing_tasks": [] + }, + "overall_score": 0.12057604157917208 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.07893017100109866 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.12579260798514427 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.15897902615904647 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.1275512898313342 + }, + "Language Understanding and 
Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.1724799353848912 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.1166739111764397 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.09276606649010487 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.014803312629399587 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.2126465842330819 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.2774436090225564 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.03857183991826921 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.06561871098996794 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.171712228743858 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.14766910173600153 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.21050154891192577 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.020659062938075456 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.02100010342704044 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.15091196450213815 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.053829016986911726 + }, + 
"numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.10744987600153451 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.2975217887286715 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.13726004635095543 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.12057604157917215 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.03610947192711297 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.04525221984520586 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.10420386904761905 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.17708549842279478 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.19999999999999998 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.1804888391778344 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.14759816564804443 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.07952603609985566 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Idefics3/task_results.json b/static/eval_results/SI/Idefics3/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..866de33ec0c590e428ff19cdab156621b8786256 --- /dev/null +++ b/static/eval_results/SI/Idefics3/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 
+ }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"medical_image_artifacts_indentification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.11904761904761904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.4833333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.27647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.10101010101010101, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "chinese_idiom_recognition", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.004464285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.0836931120056328, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.02777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.07375100780769979, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.5625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.025974025974025976, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.0659285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.40599999999999997, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.7581428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.030612244897959183, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 0.017857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.04131812587615091, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.008403361344537815, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "math_parity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.14842105263157898, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "kvqa_knowledge_aware_qa", 
+ "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.008928571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.07094736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, 
+ { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", 
+ "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.09411764705882353, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + 
"name": "pictionary_chinese_food_img2en", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + 
}, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.6034482758620691, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.41428571428571426, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.3448275862068966, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.2448275862068965, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.5689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "table2latex_complex", + "score": 0.08888888888888889, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.36896551724137927, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + 
{ + "name": "defeasible_reasoning", + "score": 0.5655172413793104, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.5896551724137931, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.27586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "humor_explanation", + "score": 0.26, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 0.27142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.4551724137931034, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electrocardiogram", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.4846153846153847, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.37142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.122, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.23571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.435483870967742, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.4758620689655173, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.5842105263157894, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.42857142857142855, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.6428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.08571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.20714285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8315789473684213, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.8650000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": 
"red_teaming_politics", + "score": 0.68, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.2799999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.6399999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/InternVL2_2B/summary_results.json b/static/eval_results/SI/InternVL2_2B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b021be0db3f55396a6b2deb794e0c5aeea0bf1a8 --- /dev/null +++ b/static/eval_results/SI/InternVL2_2B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.12069001041308772, + "micro_mean_score": 0.11842605219090299, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.28522459992910454, + "micro_mean_score": 0.28886838868388687, + "missing_tasks": [] + }, + "overall_score": 0.14262795568189 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.12376971454228163 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.13333012698269087 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.23055380602943532 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.1336101595652968 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.1905261989833371 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + 
"num_samples": 897, + "tasks": [], + "average_score": 0.08201891993308255 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.11985812372011641 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.013664596273291925 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.3035836752792625 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.4728533834586467 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.049217594376760605 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.09447908809124074 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.22923075081716637 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.15159509081542988 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.18693717087010792 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.07184873949579831 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.042960275283590164 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.12369372450210245 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.11544832152620972 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.12291071957107838 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + 
"tasks": [], + "average_score": 0.24746476545671045 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.3044601862783681 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.14262795568189013 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.03678177133215256 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.039950835968771276 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.09082268323996265 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.19769593666817548 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3428571428571428 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.15289272275533383 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.20753533217719797 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.12084183290294437 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/InternVL2_2B/task_results.json b/static/eval_results/SI/InternVL2_2B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8cd4636185c55541c18106507a48e601a70573 --- /dev/null +++ b/static/eval_results/SI/InternVL2_2B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, 
+ { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.35, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.13333333333333333, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.7511904761904761, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.07843137254901962, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.020202020202020204, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "ishihara_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + 
"score": 0.1616282025296959, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"geometry_reasoning_count_line_intersections", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.023809523809523808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.22943157894736843, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.04795530132517215, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.025015859584362813, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.7368421052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3781550944997167, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.032467532467532464, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.20764285714285705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 
+ }, + { + "name": "electricity_load_estimate_plot", + "score": 0.3585714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.4424142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.09285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.0008403361344537821, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.03968253968253969, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.004201680672268907, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"math_parity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.08888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "long_string_number_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.4773684210526316, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"kvqa_knowledge_aware_qa", + "score": 0.05263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.017857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.513578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": 
"maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.15294117647058825, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.04591836734693877, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.18534580498866213, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.2275862068965517, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.4499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.1827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.031034482758620686, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.37241379310344824, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "table2latex_complex", + "score": 
0.3444444444444445, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.196551724137931, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.296551724137931, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.27241379310344827, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.1827586206896551, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "humor_explanation", + "score": 0.22666666666666663, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 0.39999999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.1586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.2571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electrocardiogram", + "score": 0.24285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5423076923076923, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.5642857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.054000000000000006, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.3714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4741935483870969, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.2413793103448276, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7157894736842104, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.5214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.08571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.32142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.05714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + 
{ + "name": "red_teaming_visualmisleading", + "score": 0.8421052631578949, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.7100000000000003, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.5700000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.27, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.5750000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/InternVL2_76B/summary_results.json b/static/eval_results/SI/InternVL2_76B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa5a3d95b342f82d82ffccd2bcf0a2a4db5aeab --- /dev/null +++ b/static/eval_results/SI/InternVL2_76B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.3998616568018755, + "micro_mean_score": 0.39149064302628933, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.554748737158244, + "micro_mean_score": 0.5800738007380073, + "missing_tasks": [] + }, + "overall_score": 0.42051326751605805 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.4672429826553732 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.4230856844269695 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.570666577587141 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 
1630, + "tasks": [], + "average_score": 0.3413715846680563 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.470239452171767 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.37110860027855824 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3276283897777921 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.10153556963007855 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5873606708191794 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7041804511278196 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.39401514252711556 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.3333759749379774 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5065289649268628 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.39253566766026804 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.509332186292545 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.30169355252977215 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.4030580663588658 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3863929410693585 + }, + "exact_text": { + "count": 57, + 
"num_samples": 880, + "tasks": [], + "average_score": 0.4041893680050902 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.34950523809271744 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.48322911874283003 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5389260571078752 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4205132675160581 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.4585598678029664 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.1619462380866451 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.42624956232493 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.5361401478255164 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.5301587301587302 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.38874625564304305 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.47251288369387245 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.3075073077960568 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/InternVL2_76B/task_results.json b/static/eval_results/SI/InternVL2_76B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e8028662a4e4e2be537ea34d75d0f5b930e9cc88 --- /dev/null +++ b/static/eval_results/SI/InternVL2_76B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"signboard_identification", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.48333333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.07099999999999997, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.68, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.041666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.474485256080059, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.03195853363097289, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + 
{ + "name": "quizlet_question_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.8961038961038961, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6839999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.41731304954954773, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + 
"score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.4888888888888889, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.8949579831932775, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5306842105263159, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.7460317460317459, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"traffic_future_prediction_from_line_plot", + "score": 0.7437368421052631, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.748642857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.5857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.45535714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.892857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.2857142857142857, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9443785714285712, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5548571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.6581526315789472, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.5535714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.7380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.5982142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4888888888888889, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.2777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.11428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.11428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.8095238095238094, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.16326530612244897, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.3511830960216528, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.35, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.20408163265306123, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.20202020202020202, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.27450980392156865, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5316511584711222, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.3136904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.017952657306398452, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.7058823529411765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.17391304347826086, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": 
"constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.1285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6117647058823531, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + 
}, + { + "name": "ocr_math_TheoremQA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 
0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.2584183673469388, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.21468253968253967, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.43333333333333335, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.6428571428571429, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"app_layout_understanding_iphone_settings", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.21999999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.35714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.6714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.9071428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.6499999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.7620689655172415, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.4714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.7586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.5928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.7999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.7827586206896551, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + 
}, + { + "name": "docci_image_description_long", + "score": 0.7785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.489655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.4620689655172413, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.7068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.5677419354838709, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.7103448275862067, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.4428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.7400000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6038461538461539, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.6333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", + "score": 0.35000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.706896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8421052631578949, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.6034482758620691, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.22857142857142862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"bridge_strategies_expert", + "score": 0.2357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.5, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.39999999999999997, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.23571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.45, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.42857142857142855, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.67, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.875, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.7200000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.6800000000000002, + 
"eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/InternVL2_8B/summary_results.json b/static/eval_results/SI/InternVL2_8B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6626dca0ba52f31da57208eedcad8531070e052c --- /dev/null +++ b/static/eval_results/SI/InternVL2_8B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.27650612401825575, + "micro_mean_score": 0.27119471729837735, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.39388373890935635, + "micro_mean_score": 0.4045510455104551, + "missing_tasks": [] + }, + "overall_score": 0.29215647267040246 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.30220279568886643 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2915702951202482 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.41603267498315427 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.24983605813271914 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.3284779417766259 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.27396131593770284 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 
0.21701915158341967 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.0592961015994038 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.4403771552269444 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6521729323308272 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.22539102164423624 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.21516421271234623 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4088467630174509 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.27187498646061353 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.34383350461121587 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.0503849634147267 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.27991889529924496 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.25281668404704594 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2452385560845516 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.26248166960198344 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3417106670258814 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4334863789409244 + } + }, + "input_num": { + 
"1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.29215647267040246 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.25646898023629483 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.09134825639389237 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.34736300770308126 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3784296942438538 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3253968253968254 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.2912783917684807 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.34366199611891174 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.23531351908862871 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/InternVL2_8B/task_results.json b/static/eval_results/SI/InternVL2_8B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1d85f55b2fb96e1c3aac7ec219d21c83831ccb8c --- /dev/null +++ b/static/eval_results/SI/InternVL2_8B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.32653061224489793, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3705584548643767, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.029341355400881983, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.46961677159776144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.15333333333333338, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.6207965121029526, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
15 + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.31111111111111106, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.7521008403361344, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5010526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.46825396825396826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.4594839638482036, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.3337894736842106, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.6858571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 
0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.41428571428571426, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.4325396825396826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9280335714285712, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"electricity_load_estimate_plot", + "score": 0.47985714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.011813648192052783, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.5400999999999998, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3777777777777777, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.1142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.8761904761904761, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.20987470834518032, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.061224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + 
"score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5238095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.1717171717171717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.16176470588235295, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.39826242128287875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.008928571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.00602410597959912, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.13043478260869565, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + 
{ + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.6190476190476192, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.32941176470588246, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.16096938775510203, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.20918367346938774, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + 
"score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.14285714285714285, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.152, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.08571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.5857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8071428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.4620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.4071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.39655172413793105, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.4928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.5413793103448274, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.3586206896551723, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.6142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.22068965517241376, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.14827586206896554, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.4103448275862069, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + 
"name": "GUI_Chat_Hard", + "score": 0.6064516129032257, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.4551724137931034, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.41333333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.676923076923077, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.5111111111111111, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.37241379310344824, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.805263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.2379310344827586, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.31428571428571433, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.3214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.17857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.4428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.3785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.66, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.86, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.6550000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8631578947368422, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.5650000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Llama_3_2_11B/summary_results.json b/static/eval_results/SI/Llama_3_2_11B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1f38c1b0b9723c7ad1eb5965264683efa3b48f73 --- /dev/null +++ 
b/static/eval_results/SI/Llama_3_2_11B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.20789144960796493, + "micro_mean_score": 0.20163641703273802, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.3861125858565788, + "micro_mean_score": 0.4130381303813038, + "missing_tasks": [] + }, + "overall_score": 0.2316542677744468 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.20716804318138016 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2546845731733449 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.37246318118748967 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.14653430680774066 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.25994005315432245 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.21893599730050764 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.15806381880426276 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.058403715092363084 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.39649168256429074 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.5728796992481204 + } + }, + "input_format": { + "User Interface 
Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.18967604731477847 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.13775107230512224 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.3387317156024255 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.1970899659349296 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3275861238187186 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.14822411270107955 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.24509462859331077 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.15123880546660726 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.16571305203663964 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.16301171403498463 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.34463240030392384 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.3762691853600945 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.23165426777444673 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.23423754995839735 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.09595984705908096 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.14131944444444444 
+ }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.2740778723883188 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.22857142857142856 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.18716549835825297 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.33493936008655223 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.12719796356144183 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Llama_3_2_11B/task_results.json b/static/eval_results/SI/Llama_3_2_11B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..458370c2887069e76fc00bfb6a713fd2ce451d4b --- /dev/null +++ b/static/eval_results/SI/Llama_3_2_11B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 
0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.4387755102040816, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.58, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.08577420938532272, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.010733939687873404, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.11976895147024216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.8376623376623374, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.10497817448547724, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.0933333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.2431787841758548, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "stock_info_parsing", + "score": 0.8151260504201682, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.009473684210526306, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.4170800923924453, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.02831578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.05935714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.05263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.5857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.53125, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.19642857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.5912698412698412, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.6875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.671125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.23564285714285718, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.0808793106923892, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.3097052631578948, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.49107142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.4761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.38392857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.28888888888888886, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.2285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"multiple_states_identify_americas", + "score": 0.27142857142857146, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.2285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.08027210884353742, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.2874256645177423, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.6904761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.0452954818941379, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.14285714285714285, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.07142857142857142, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.10101010101010101, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.25735294117647056, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.005791505791505792, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5361188092942067, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.002976190476190476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": 
"icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.0013736263736263737, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14 + }, + { + "name": "Bongard_Problem", + "score": 0.2894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.13043478260869565, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.7142857142857144, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6705882352941177, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.16071428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.09835600907029478, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 
0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.1, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.5642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.33571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.5758620689655174, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.5571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.5620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.4428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.6275862068965519, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.5586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.5928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.5000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.3482758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.4724137931034484, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.38064516129032255, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.20344827586206896, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.7, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.4666666666666667, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.4115384615384616, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.23333333333333328, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", 
+ "score": 0.4642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.3551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7263157894736842, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.410344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.1142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.19999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.07857142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.3, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.74, + 
"eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.18421052631578955, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.9, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.7, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8473684210526319, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.8200000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/MiniCPM_v2.6/summary_results.json b/static/eval_results/SI/MiniCPM_v2.6/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1e91edbcb5931b29cb88fcb6d0990c607e10cb5f --- /dev/null +++ b/static/eval_results/SI/MiniCPM_v2.6/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.23230765810722817, + "micro_mean_score": 0.22684118052665975, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.4360655066213874, + "micro_mean_score": 0.4588560885608856, + "missing_tasks": [] + }, + "overall_score": 0.2594753712424494 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.26814713591233313 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2657183000752527 + }, + "Scene and Event Understanding": 
{ + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.3977302205205499 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.18352505380246076 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.3045977370408878 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.2244713686485571 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.17375496033997198 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.06087615859328559 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.45156722842924535 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.660718045112782 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.21066692306852683 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.17128318830807052 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.3681846956052881 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.23021362338817897 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.34994481629202306 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.012567281814686655 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.20284349423013687 
+ }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.23679437883858215 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.21540007432647457 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.2036075191422558 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3711731498662282 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.39586776859504136 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.25947537124244935 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.21340553041678637 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.07517089101065139 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.20497125933706817 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3620762837308124 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3507936507936508 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.25260048981169975 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.33417132133610217 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.14556723677922526 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/MiniCPM_v2.6/task_results.json b/static/eval_results/SI/MiniCPM_v2.6/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4fd3adbb982b548e0ba82627ef2680b85d85542a --- /dev/null +++ 
b/static/eval_results/SI/MiniCPM_v2.6/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.30216436328431096, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + 
"score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.031192609502066617, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.08571428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.7261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.1414141414141414, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.17142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.8571428571428571, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.03214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.03365154769951505, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { 
+ "name": "interpret_force_perspective_illusion", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.1786994367639529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 0.23684210526315788, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.26673684210526316, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.43479972951988266, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.38888888888888884, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.5208819022936791, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"graph_theory", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.09114179383067332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3512276296925473, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.7467532467532468, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.3992857142857144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.16192857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8309857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.18367346938775508, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.26, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.02513456362937331, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.49999999999999994, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.6890756302521008, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.28888888888888886, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 19 + }, + { + "name": "long_string_number_recognition", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.4542105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.17992339728368958, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.24133333333333346, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "coco_object_detection_by_query_property", + "score": 0.5079713114698053, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.0750526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.6904761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.7058823529411766, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.2202380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.09450113378684807, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.2, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.12857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 
0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.18571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.5896551724137933, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.4071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.4689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.2586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.7785714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.22142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.7103448275862069, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "table2latex_complex", + "score": 0.41111111111111115, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.5793103448275863, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.5586206896551725, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.5206896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.393103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": 
"humor_explanation", + "score": 0.5400000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 0.49285714285714277, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.4206896551724137, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electrocardiogram", + "score": 0.1928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6923076923076923, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.4928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.09, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.46428571428571425, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.5322580645161291, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.5827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.768421052631579, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.5857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.3928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + 
"score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.20714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.2571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.42142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8578947368421055, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.555, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.7100000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.48999999999999994, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.7349999999999999, + "eval_type": "llm", + "num_demo": 
0, + "num_query": 20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Molmo_72B/summary_results.json b/static/eval_results/SI/Molmo_72B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..67df8a5ceb069e1926824e120e649ddeba93073b --- /dev/null +++ b/static/eval_results/SI/Molmo_72B/summary_results.json @@ -0,0 +1,223 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 270, + "num_eval_samples": 4073, + "num_not_eval_samples": 0, + "macro_mean_score": 0.36480000609384927, + "micro_mean_score": 0.36205779758110807, + "missing_tasks": [ + "planning_screenshot_termes", + "table_understanding", + "MMSoc_Misinformation_PolitiFact" + ] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.4465682063915481, + "micro_mean_score": 0.4850553505535054, + "missing_tasks": [] + }, + "overall_score": 0.3758072638262318 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1672, + "tasks": [], + "average_score": 0.41462128751753047 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.4223762317042425 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.5756984198310193 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1615, + "tasks": [], + "average_score": 0.2983397150768741 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1698, + "tasks": [], + "average_score": 0.4110431137367615 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.3070615049173117 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1257, + "tasks": [], + "average_score": 0.29197652844726363 + }, + "Planning and Decision Making": { + "count": 23, + 
"num_samples": 340, + "tasks": [], + "average_score": 0.07825953913967484 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 639, + "tasks": [], + "average_score": 0.445412976139552 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 155, + "tasks": [], + "average_score": 0.5953120300751881 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1107, + "tasks": [], + "average_score": 0.3754692399771127 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.24743187516175363 + }, + "Photographs": { + "count": 83, + "num_samples": 1300, + "tasks": [], + "average_score": 0.47052754190598134 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1509, + "tasks": [], + "average_score": 0.35176591065677537 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.4405271103381682 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.18757766329699532 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.3670439889863054 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3048441329189725 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.36443166533642163 + }, + "numerical_data": { + "count": 39, + "num_samples": 679, + "tasks": [], + "average_score": 0.3342330361070466 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.4120820025247545 + }, + "multiple_choice": { + "count": 33, + "num_samples": 552, + "tasks": [], + "average_score": 0.5421225239407056 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5182, + "tasks": [], + "average_score": 
0.3757024328002091 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.5042591723808818 + }, + "Planning": { + "count": 44, + "num_samples": 698, + "tasks": [], + "average_score": 0.13015529062282669 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.2582151610644257 + }, + "Perception": { + "count": 82, + "num_samples": 1306, + "tasks": [], + "average_score": 0.4704848349431393 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.6714285714285714 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.3557374102316002 + }, + "Knowledge": { + "count": 77, + "num_samples": 1279, + "tasks": [], + "average_score": 0.39648868632862583 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.2954490282663994 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Molmo_72B/task_results.json b/static/eval_results/SI/Molmo_72B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5afd5f91add75e7356970c4840d0ccf634eb7ede --- /dev/null +++ b/static/eval_results/SI/Molmo_72B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 
0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.09950000000000002, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.64, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.35714285714285715, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3362778655358422, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.1608696123082764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "quizlet_question_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.5787232350283793, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "game_info_parsing", + "score": 0.8246753246753247, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.2665355246878068, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "super_clevr", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.31399999999999995, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.31111111111111106, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.8529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.6192105263157894, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.7539682539682541, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.4786650057305392, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.42857142857142855, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.530263157894737, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.6355000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "weather_info_parsing", + "score": 0.6468253968253969, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.7636999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5474285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "single_person_pose_estimation", + "score": 0.21567342533242703, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.7032473684210526, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.6339285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.47619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.5555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.17142857142857146, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "game_platform_support_identification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"distinguish_ai_generated_image", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5102040816326531, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "face_keypoint_detection", + "score": 0.5583866383665127, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, 
+ { + "name": "multiple_states_identify_asia", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.24285714285714288, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.020673971469919374, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.8869047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.1836734693877551, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.42857142857142855, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.24242424242424243, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.39558823529411763, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "web_action_prediction", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.32955157882083963, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.24404761904761907, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.030696230874143155, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"Bongard_Problem", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.21739130434782608, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.7619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.9047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { 
+ "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "shape_composition_shapes", + "score": 0.2074829931972789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.3755385487528345, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + 
"score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.5294117647058824, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "app_layout_understanding_ppt", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + 
"score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.23214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.09073936194509774, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.18571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 0 + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 0 + }, + { + "name": 
"MMSoc_Misinformation_PolitiFact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 0 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.7400000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.54, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.7000000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.905263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.5149999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.35714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.05, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.6285714285714284, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.7000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"meme_explain", + "score": 0.7071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.7241379310344828, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.5285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.6862068965517241, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.5214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.6862068965517241, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.7827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.7071428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.5310344827586206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.25862068965517243, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.6827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.629032258064516, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.42758620689655175, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.6533333333333334, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7461538461538463, + "eval_type": 
"llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.3111111111111111, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", + "score": 0.27142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.6275862068965516, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8263157894736842, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.4827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.08571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Molmo_7B_D/summary_results.json b/static/eval_results/SI/Molmo_7B_D/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..a75e051b5c4fd6a1eb5c021faa7ba3bfd17d761d --- 
/dev/null +++ b/static/eval_results/SI/Molmo_7B_D/summary_results.json @@ -0,0 +1,221 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 272, + "num_eval_samples": 4102, + "num_not_eval_samples": 0, + "macro_mean_score": 0.2098088446992518, + "micro_mean_score": 0.20550929661464645, + "missing_tasks": [ + "MMSoc_Misinformation_PolitiFact" + ] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.35697926179118733, + "micro_mean_score": 0.38936039360393604, + "missing_tasks": [] + }, + "overall_score": 0.22949405972428777 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2239160707791646 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.24958564675030656 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.35830528296805764 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.17259103199957743 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1698, + "tasks": [], + "average_score": 0.25685172601108597 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.19244928377978027 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.1939605648275455 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.035588894400059294 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 639, + "tasks": [], + "average_score": 0.32356781790614975 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 155, + "tasks": [], + "average_score": 0.4433947368421053 + 
} + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.18796442406686825 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.1327652104313917 + }, + "Photographs": { + "count": 83, + "num_samples": 1300, + "tasks": [], + "average_score": 0.323282724310645 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.21356852314768052 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3106093738160722 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.09043432702433757 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.21610753722787088 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.17305260714177756 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.17907829453546903 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.22086240998395923 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.324079404512755 + }, + "multiple_choice": { + "count": 33, + "num_samples": 552, + "tasks": [], + "average_score": 0.3169618260527351 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5213, + "tasks": [], + "average_score": 0.22943156697817663 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.27184002856754413 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.06424366688759706 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": 
[], + "average_score": 0.1158110119047619 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.30311570603428784 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3619047619047619 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.173722800705029 + }, + "Knowledge": { + "count": 77, + "num_samples": 1279, + "tasks": [], + "average_score": 0.2787344822161389 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.1740048655548875 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Molmo_7B_D/task_results.json b/static/eval_results/SI/Molmo_7B_D/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0bbe3028db39138ab0ace362dae451ffeb4d4922 --- /dev/null +++ b/static/eval_results/SI/Molmo_7B_D/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.3877551020408164, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chart_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.313862400588624, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.038011511191532274, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"graph_isomorphism", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_winner_identification", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.41069776234840943, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "game_info_parsing", + "score": 0.7662337662337662, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.02719542601438354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "super_clevr", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.48066666666666674, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.35714285714285715, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.6552623858210546, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.08888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.8277310924369747, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5805263157894738, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.43650793650793646, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.26186238411043455, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.38000000000000006, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.47507142857142853, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.33571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.21875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.5634920634920635, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.6044642857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.42121428571428565, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.060991464591085905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.3600894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.34523809523809523, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.2767857142857143, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4888888888888889, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.2571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.24285714285714288, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, 
+ { + "name": "healthcare_info_judgement", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.01677018633540373, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.7261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.12244897959183673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "image_translation_en2cn", + "score": 0.015030224651396001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.16666666666666669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 
0.10101010101010101, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.13725490196078433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "web_action_prediction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.10903008480749887, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.125, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.014172871487032616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + 
"score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.09649122807017543, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.21904761904761907, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { 
+ "name": "panel_images_multi_question", + "score": 0.619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.08571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.5411764705882353, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.15858843537414966, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.1503684807256236, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 
0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 19 + }, + { + "name": "cultural_vqa", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 0 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.43571428571428567, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, 
+ { + "name": "meme_explain", + "score": 0.5214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.3586206896551723, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.45714285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.5896551724137932, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.4714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.6517241379310346, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.703448275862069, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.5214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.35862068965517246, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.19999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.4379310344827586, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.5967741935483872, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.23103448275862065, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6730769230769231, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.05555555555555555, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.31379310344827577, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7789473684210527, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.4, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.35714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"table_understanding_fetaqa", + "score": 0.4928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.7150000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.71, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.6849999999999998, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.255, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.6428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/NVLM/summary_results.json b/static/eval_results/SI/NVLM/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..45c4dacafa84da381f3a9b804029c98426e57384 --- /dev/null +++ b/static/eval_results/SI/NVLM/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.32989872890926025, + "micro_mean_score": 0.32315683713111915, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.4469349818134809, + "micro_mean_score": 0.4881303813038132, + "missing_tasks": [] + }, + "overall_score": 0.34550356262982296 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.3943476764428869 + }, + 
"Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.3359094293956291 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.46386896656934745 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.30043411704099793 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.38986101015677044 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.3152573721587561 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.26907670581189963 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.07615011020932495 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.45915496990325566 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6521954887218044 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.2814148428882822 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.30070480033875985 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4332099707344069 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.32094439294995036 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.4387718807206103 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + 
"average_score": 0.09447890526012262 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.34135355449546323 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3215154320779893 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.29287492253780084 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.28793758479482745 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3828322321439372 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5016004197822379 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.345503562629823 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.352859881186271 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.1252138046141793 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.37153871965452856 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.45079588183469294 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3047619047619048 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.3518857602487131 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.37572531212341936 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.2786818799518423 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/NVLM/task_results.json b/static/eval_results/SI/NVLM/task_results.json new file mode 
100644 index 0000000000000000000000000000000000000000..e8e5b4debdef91683ae1d1543ab9361ef5d55cb9 --- /dev/null +++ b/static/eval_results/SI/NVLM/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.21666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5408163265306122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.54, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.041666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3505208453866322, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.046100667663102404, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.4733815589511575, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.8896103896103894, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.05655934095486302, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5693333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.3056056231994156, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.3777777777777777, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.8823529411764708, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5594736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.7142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.40264995876467374, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.5572631578947368, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.3831428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.5142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.5803571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.8412698412698412, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { 
+ "name": "ti_fused_vqa_physics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8834571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.4170714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.15524366362546105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.39636315789473686, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.44642857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.35555555555555557, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.11428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + 
"score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.2914684343291797, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.8, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.01873586319222244, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.8761904761904761, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.24489795918367346, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.1238698998933831, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.12244897959183673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.42857142857142855, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.20202020202020202, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.2901960784313726, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.39285714285714285, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.35170464659390166, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.1488095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 
0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.022876924032503295, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.23684210526315788, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 
+ }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.6904761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000001, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.738095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 
0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.13392857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.05952380952380953, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.07142857142857142, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + 
"name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + 
}, + { + "name": "app_layout_understanding_zoom", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.21, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.37142857142857144, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.5857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.6357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.5428571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.7310344827586209, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.7655172413793104, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.5857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.7172413793103448, + "eval_type": "llm", + "num_demo": 1, + "num_query": 
29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.7655172413793103, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.6714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.4896551724137931, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.2517241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.7068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4709677419354839, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.6379310344827587, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.6266666666666668, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.726923076923077, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.4444444444444444, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", + "score": 0.25, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.710344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7947368421052632, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.4827586206896553, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": 
"bridge_strategies_worldclass", + "score": 0.1142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.32857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.15714285714285717, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.05, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.20000000000000004, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.725, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.11052631578947371, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.535, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.6649999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + 
"score": 0.9105263157894737, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.3499999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/POINTS_15_7B/summary_results.json b/static/eval_results/SI/POINTS_15_7B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..644eeeb1861af7a618903bbbf3e87ee094ef44c8 --- /dev/null +++ b/static/eval_results/SI/POINTS_15_7B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.31355970638319003, + "micro_mean_score": 0.30728203432446294, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.41331219301389166, + "micro_mean_score": 0.42749077490774917, + "missing_tasks": [] + }, + "overall_score": 0.32686003793395024 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.3443899066327916 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.3333459246264911 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.43105364189963935 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.28961632718794406 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.35317851821169477 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + 
"tasks": [], + "average_score": 0.30711751050032277 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.26796963300870397 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08369131166291023 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.45980379926019677 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6173496240601504 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.27713077639707523 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.24722440389191766 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4276343385855984 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.30991539183635347 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.37330010041194067 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.15572486552610215 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.3069044183161335 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3101162129247054 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2614010338203017 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.28761899055063767 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 
0.37619796536407 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4855568673750491 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.32686003793394974 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.3095789895735217 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.1277481304284383 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.31641062675070025 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.4420532221275683 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.48095238095238096 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.32551503611448934 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.35705988992418164 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.24128406446063128 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/POINTS_15_7B/task_results.json b/static/eval_results/SI/POINTS_15_7B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1b4734ccbd99ebb7e2040b4f938d34fb0b03f975 --- /dev/null +++ b/static/eval_results/SI/POINTS_15_7B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + 
"score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5468542942842844, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.013803101918097808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.06190476190476191, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"cultural_vqa", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.6488095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.3696078431372549, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.24242424242424243, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": 
"chinese_idiom_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.8142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "play_go_capture_stone", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.047619047619047616, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + 
"score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0523380101397914, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.2303962026767666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4666666666666667, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.8338, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.6204559427628572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.03178571428571437, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.01108560858915326, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.23277542468896867, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.8116883116883117, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.6251428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.41471428571428565, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8822571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.45000000000000007, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.58, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 0.6339285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.16859258819506148, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.6904761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.9705882352941178, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.59375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.08888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "long_string_number_recognition", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + 
"score": 0.7263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.04314371819298669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.5803571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.5253333333333335, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"traffic_future_prediction_from_line_plot", + "score": 0.5571578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.03333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { 
+ "name": "table_understanding_fact_verification", + "score": 0.5952380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6705882352941177, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.21428571428571427, + "eval_type": "rule", 
+ "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.05, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"MFC_Bench_check_text_entity_replace", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.22023809523809526, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.23013038548752834, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + 
"score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.5310344827586206, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.557142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.4241379310344827, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.2586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.7500000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.6034482758620691, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "table2latex_complex", + "score": 0.47777777777777786, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.45517241379310336, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.4620689655172413, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.4344827586206897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.34137931034482766, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "humor_explanation", + "score": 0.28, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 
0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.3551724137931033, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electrocardiogram", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6846153846153845, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.6357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.24200000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.5571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.5516129032258066, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.4517241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.736842105263158, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.6285714285714284, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.2357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.3642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.31428571428571433, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.3928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.25000000000000006, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.11052631578947371, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.6849999999999998, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.5199999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.3, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.6549999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + } +] \ No newline at end of file diff --git 
a/static/eval_results/SI/POINTS_7B/summary_results.json b/static/eval_results/SI/POINTS_7B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8db3edd07587d8cd344d3781063797b53af6eae2 --- /dev/null +++ b/static/eval_results/SI/POINTS_7B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.25511317681632334, + "micro_mean_score": 0.24927711632415062, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.30315625179016, + "micro_mean_score": 0.3313653136531366, + "missing_tasks": [] + }, + "overall_score": 0.26151892014616823 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2684488868499041 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.27890902837062037 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.37373928195086786 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.22387504020162652 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.2799740367463896 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.21311917080615544 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.21857370538972226 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.06502747891786666 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + 
"tasks": [], + "average_score": 0.36827291874151846 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.48204135338345855 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.18689735511962233 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.19332242733156837 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.3523684400745285 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.25684059763242745 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.33916980654110634 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1499797713556708 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.2320749867881998 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.23004840221723208 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.239982641771955 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23646374895042882 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.28263350209672056 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4200183654729108 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2615189201461682 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 
0.22503259387671015 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.08476480516686397 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.3151282387955181 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3737263982731003 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.34761904761904755 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.2606187882141402 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.27361452525243724 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19633555542091463 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/POINTS_7B/task_results.json b/static/eval_results/SI/POINTS_7B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b8da2e4943c6b9f525120acfd8b837a93cc11194 --- /dev/null +++ b/static/eval_results/SI/POINTS_7B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.061224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.6019887092978411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.3157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 
0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.024872434514772418, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09455782312925169, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.3058823529411765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.1717171717171717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"number_comparison", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.08695652173913043, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.016800837996620143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.2857142857142857, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.43135590531192486, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.14285714285714285, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 0.07894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.21949473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.6984126984126984, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.12450104213939192, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.18628758352725225, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.27046408475418426, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.525974025974026, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.3725, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.4362142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.6044285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.3469387755102041, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.16, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.08567382842562732, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.04761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"stock_info_parsing", + "score": 0.3529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.15625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "long_string_number_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7157894736842105, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.04747631990100836, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + 
}, + { + "name": "science_basic_physics", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.20535714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.17799999999999994, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.37305263157894736, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"memorization_papers", + "score": 0.03333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.6904761904761905, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 
0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6117647058823532, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.07142857142857142, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + 
"score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 
0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.17091836734693877, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.1685090702947846, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.46206896551724125, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.1642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "science_figure_explanation", + "score": 0.34827586206896544, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.22413793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.23571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.4103448275862068, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "table2latex_complex", + "score": 0.34444444444444444, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.23793103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.44482758620689644, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.3793103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.4241379310344828, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "humor_explanation", + "score": 0.29999999999999993, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.26551724137931026, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.4857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electrocardiogram", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"GUI_Chat_Easy", + "score": 0.6769230769230771, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.3142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.13199999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.48387096774193544, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.3206896551724137, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7105263157894736, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.45, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.0, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.37857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.1, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.836842105263158, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.6900000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.6099999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.37, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.625, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Phi-3.5-vision/summary_results.json b/static/eval_results/SI/Phi-3.5-vision/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b7443fff17692c8d16a8171ae077403abf95772d --- /dev/null +++ b/static/eval_results/SI/Phi-3.5-vision/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.2561274958722834, + "micro_mean_score": 0.2504214576875906, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + 
"num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.4272267419054576, + "micro_mean_score": 0.445879458794588, + "missing_tasks": [] + }, + "overall_score": 0.2789407286767066 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2682909697086125 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2845968124529633 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.4299430434813172 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.22905610983444738 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.3097558922032538 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.26422404318271525 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.21524515429041854 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08173397535709728 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.44526176399160444 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6958045112781954 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.18482544209393917 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.1852656532829957 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4073649042468842 + 
}, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.2797292831010349 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3249099963089235 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.04423070234557053 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.2567782320477167 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.2141318618135909 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.23002523914604356 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.20335546763980886 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.38510487366381607 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.46076785167694245 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2789407286767065 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.18412184931451608 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.09254779551496593 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.3150531045751634 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.38360617573843164 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4142857142857143 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.3034971430938622 + }, + "Knowledge": { + 
"count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.3374902354661273 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19473774010136682 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Phi-3.5-vision/task_results.json b/static/eval_results/SI/Phi-3.5-vision/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b3aff65bd67780f32d3dfcac0b21af7487ca36c4 --- /dev/null +++ b/static/eval_results/SI/Phi-3.5-vision/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"ascii_art_understanding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.10204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.12462000961394672, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.11904761904761904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.029751219517657808, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.018367346938775512, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.6488095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "relative_reflectance_of_different_regions", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.15686274509803924, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.12121212121212122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.043478260869565216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.04535618459802735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"realworld_qa_en2cn", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 
0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.40476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.4628842105263158, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.5246016576544484, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.6269841269841271, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.7891263675852077, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + 
"score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.03700000000000008, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.032668828291194586, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.35448451238579437, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.5259740259740259, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.7468571428571432, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5454285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"youtube_video_info_parsing", + "score": 0.16666666666666669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9523071428571426, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.28571428571428564, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.13571428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.020833333333333332, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.52, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.017032833262569633, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.46825396825396826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.6260504201680673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.40625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.24444444444444446, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "long_string_number_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.6657894736842106, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.21324372091628846, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.44444444444444436, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.2767857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.22333333333333344, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.3399232403523023, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.13326315789473686, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 
0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.4285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.16071428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.15972222222222224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 
0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.6724137931034485, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.3785714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.4517241379310346, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.7214285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.34285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table2latex_complex", + "score": 0.3555555555555555, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.39655172413793116, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.5275862068965518, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.7310344827586208, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.4448275862068965, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "humor_explanation", + "score": 0.26666666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "art_explanation", + "score": 0.2448275862068965, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.5000000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electrocardiogram", + "score": 0.27142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6115384615384616, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.5857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.08, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.5857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.4064516129032259, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.42758620689655175, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7368421052631579, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": 
"docci_image_description_long", + "score": 0.47857142857142854, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.19999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.10714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.38571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.21428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8894736842105264, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { 
+ "name": "red_teaming_celebrity", + "score": 0.89, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "iq_test", + "score": 0.24137931034482757, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "ocrqa", + "score": 0.48620689655172417, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.46428571428571436, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_politics", + "score": 0.6749999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.915, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.8, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Pixtral_12B/summary_results.json b/static/eval_results/SI/Pixtral_12B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cf0689e6dbc300e655b0ab20bbcac39388d1c437 --- /dev/null +++ b/static/eval_results/SI/Pixtral_12B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.3436942439614412, + "micro_mean_score": 0.3373564384613738, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.4417271955536318, + "micro_mean_score": 0.4845633456334564, + "missing_tasks": [] + }, + "overall_score": 0.3567653041737333 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.39551360119171197 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.37359181974124417 + }, + "Scene and Event 
Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.4677006268371793 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.3055711926752603 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.38842270268832113 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.35085932465399283 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.28269833721806076 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08507904212012304 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.4193828210432134 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6302142857142857 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.31669784888602887 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.2688429906381188 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4327891810625066 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.36461586731895695 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3947713702430871 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.11048396896880823 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 
0.36511340930610364 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3161209026018942 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.29510067482559116 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3135393276021012 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3995518703501119 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5076172985263894 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3567653041737331 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.4143415072482432 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.122839038193565 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.3689221521942111 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.46377210154054166 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3444444444444444 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.35876745089800455 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.37374171749764634 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.27839183583970506 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Pixtral_12B/task_results.json b/static/eval_results/SI/Pixtral_12B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f6db51d07c023f8f5a8a4b56fcad623fa9902f0e --- /dev/null +++ 
b/static/eval_results/SI/Pixtral_12B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.21666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.01728571428571422, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 0.9333333333333333, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.74, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3819734783418875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.006682223651902177, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.9473684210526315, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.5101908757577766, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.8961038961038961, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.22936526304647478, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.636, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.7556321946743035, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + 
}, + { + "name": "places365_scene_type_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.4888888888888888, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.9369747899159664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7302105263157892, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.7460317460317462, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5540115257336367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.41168421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.6427142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.5071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.48214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.7896825396825398, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9698285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.5464285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.15356401612245238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.48218947368421045, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.6160714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.5357142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"newspaper_page_parse_and_count", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.28571428571428575, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.20000000000000004, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "relative_depth_of_different_points", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.2703724018737771, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", 
+ "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.9047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.16326530612244897, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.3052125197255065, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5789473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.619047619047619, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.23232323232323232, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.3980392156862745, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.48256296698343887, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.23214285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.3095238095238095, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 
0.018168314191328267, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.23684210526315788, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.17142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": 
"constrained_generation_contain_position_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.08571428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"MFC_Bench_check_out_of_context", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.2057823129251701, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.2205215419501134, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + 
"score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", 
+ "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.22200000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.32857142857142846, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.5999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8500000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.2785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.7379310344827585, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.6827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.4928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.5827586206896551, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.29655172413793096, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, 
+ { + "name": "visualization_with_code", + "score": 0.40714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.7689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "table2latex_complex", + "score": 0.4111111111111112, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.7379310344827585, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.6896551724137933, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.4586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "humor_explanation", + "score": 0.3933333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 0.47857142857142854, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.4827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7461538461538463, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.46451612903225814, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.6448275862068966, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8105263157894738, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.6642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.35714285714285715, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.09285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.4142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.10714285714285718, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.2571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.72, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.69, + "eval_type": "llm", + 
"num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.54, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.7649999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Qwen2_VL_2B/summary_results.json b/static/eval_results/SI/Qwen2_VL_2B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..9b971e81cae22201b809b20a03940d5a8fa91adb --- /dev/null +++ b/static/eval_results/SI/Qwen2_VL_2B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.22787906973244856, + "micro_mean_score": 0.2234748515064842, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.3509364634962041, + "micro_mean_score": 0.3768757687576875, + "missing_tasks": [] + }, + "overall_score": 0.24428672223428263 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2253353309586889 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.25965511679594977 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.3778480095314066 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.19211647307230917 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.27091980735233906 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.21906286524745977 + }, + "Spatial and 
Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.19305913502727232 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.07432337143230854 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.37769658880841467 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.5887067669172933 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.1930642044577058 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.13312812081322709 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.36205043973893236 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.23259922343062173 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.2842921728720087 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.09293971931071163 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.23210528191388644 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.1652854805017628 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.17061075451792151 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23904036592289388 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3296071840681468 + }, + "multiple_choice": { + "count": 33, + 
"num_samples": 567, + "tasks": [], + "average_score": 0.42328479601206864 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.24428672223428244 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.21948696002702636 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.08656714113327156 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.18075323879551822 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.33781829803679747 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.36984126984126986 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.2448949597527861 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.28841305815072016 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.16147424237969243 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Qwen2_VL_2B/task_results.json b/static/eval_results/SI/Qwen2_VL_2B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e11e8fd8508449a25de25e2575f1b61f16033db5 --- /dev/null +++ b/static/eval_results/SI/Qwen2_VL_2B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "location_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.02040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 15 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5018438475034174, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "CLEVRER_physics", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.03287939197004419, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.10884353741496598, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.26666666666666666, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hashtag_recommendation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.22794117647058823, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.1414141414141414, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.21428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "music_sheet_format_QA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.10416666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "GUI_Act_Web_Single", + "score": 
0.07779907911988974, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_translation_en2cn", + "score": 0.044674076957756666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4499764488395411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 
0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "position_relationship", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Bongard_Problem", + "score": 0.12280701754385963, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.16666666666666669, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"electricity_plot_future_prediction", + "score": 0.36160000000000003, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "coco_person_detection", + "score": 0.5182194949333015, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.3571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_transformation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"single_person_pose_estimation", + "score": 0.058861588605067065, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.36607142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.18447073719814694, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.6623376623376623, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.12299999999999996, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.2631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.11666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "image_style_recognition", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.37485714285714283, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.38095238095238093, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.8766142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.23469387755102034, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.19285714285714284, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.66, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "movie_info_parsing", + "score": 
0.5178571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.043022295764280405, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "TV_show_info_parsing", + "score": 0.5238095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.7058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.40625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.5589473684210526, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "places365_scene_type_classification", + 
"score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.1887112140198952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.17777777777777776, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.4017857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.46666666666666673, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "humor_understand_caption_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "figureqa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.35158243862450156, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.513578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": 
"chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.4761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.7058823529411766, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.18579931972789115, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.1149092970521542, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": 
"poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figurative_speech_explanation", + "score": 0.4068965517241379, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.39310344827586197, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.23448275862068968, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.4928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 0.12142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.5379310344827586, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "table2latex_complex", + "score": 0.12222222222222223, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_interpretation", + "score": 0.3310344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.5034482758620689, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_humor_understanding", + "score": 0.506896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.296551724137931, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "humor_explanation", + "score": 0.5533333333333333, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "vibe-eval", + "score": 0.5142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.5206896551724136, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + 
}, + { + "name": "electrocardiogram", + "score": 0.29285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5692307692307693, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "funny_image_title", + "score": 0.2285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.07200000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "meme_explain", + "score": 0.2285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.5193548387096775, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "unusual_images", + "score": 0.32068965517241377, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7052631578947368, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "docci_image_description_long", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.2857142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.17142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.11428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.05714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.23571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 
0.04285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.10714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.4428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.2571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_captcha", + "score": 0.26842105263157895, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8842105263157897, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.7050000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.62, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.31499999999999995, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.6250000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Qwen2_VL_72B/summary_results.json b/static/eval_results/SI/Qwen2_VL_72B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ede9d54993b54c73ddf7fd14fa46ff74244d04e5 --- /dev/null +++ b/static/eval_results/SI/Qwen2_VL_72B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + 
"num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.4730536307784527, + "micro_mean_score": 0.4659830915476831, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.5510079982505317, + "micro_mean_score": 0.5826568265682657, + "missing_tasks": [] + }, + "overall_score": 0.48344754644139654 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.5688395686544739 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.49559260360544427 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.6040487985710314 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.40095954140813556 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5387802130987105 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.43580017776139807 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.367367170491919 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.1474368760019346 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5782670824874114 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7294097744360902 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.5070634902661117 + }, + "Text-Based Images and Documents": { + 
"count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4333175250859433 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5308367876160253 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.4473618716373871 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.5251544991587351 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.18309869697155778 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.49191356756271953 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.45605294241715827 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.4608929319719144 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.44066773476234555 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.4974532098882374 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5851458306003763 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4834475464413966 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.6323628750211533 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.23323065689783842 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.48352372198879545 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.6141225191470527 + }, + "Metrics": { + "count": 3, + 
"num_samples": 45, + "tasks": [], + "average_score": 0.39365079365079364 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.41914085094672937 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4874613649312476 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.3355316008767396 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Qwen2_VL_72B/task_results.json b/static/eval_results/SI/Qwen2_VL_72B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..723efdfd25a66a6bcdb1251d77044c84327645ee --- /dev/null +++ b/static/eval_results/SI/Qwen2_VL_72B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.38333333333333336, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"graph_isomorphism", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.35550000000000015, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5510204081632651, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.84, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.22916666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + 
}, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.4347808225785918, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.1519116796574013, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.7396930070845659, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.9415584415584416, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.25316516240287096, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + 
"score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6546666666666668, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.6090597441949396, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5535353535353535, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 
1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.9747899159663866, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7573684210526316, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.761904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.7785773738753342, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6320526315789474, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.7822142857142859, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"famous_building_recognition", + "score": 0.8125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.7410714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.9404761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.9375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9441071428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.6255, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.273716885964573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.6151315789473685, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + 
}, + { + "name": "iconqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.6339285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.7261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.7321428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + 
"score": 0.27142857142857146, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.3142857142857144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.15714285714285717, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.1142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.469394874670551, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { 
+ "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.021690583803327123, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.9, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.1836734693877551, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.4790877604978875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.20408163265306123, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 
+ }, + { + "name": "mindmap_elements_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.3939393939393939, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.4504901960784313, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.5357142857142857, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.4897466817225849, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.4583333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.40476190476190477, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.026882674514698886, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14 + }, + { + "name": "product_ocr_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.2571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.21739130434782608, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.9047619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.3642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.5529411764705883, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 
0.857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.46811224489795916, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.4113378684807256, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + 
"score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 
0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.5, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.7647058823529411, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.29200000000000004, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.3071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.6928571428571428, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.75, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.793103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.5857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.7413793103448277, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.6928571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.813793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.724137931034483, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.75, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.6241379310344828, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.43793103448275855, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.7827586206896552, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.6516129032258065, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.7448275862068966, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.4785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 
0.44666666666666666, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7307692307692307, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.6888888888888888, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.7413793103448275, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7894736842105264, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.6241379310344829, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.19999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.2, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.16428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.45714285714285713, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.36428571428571427, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.507142857142857, + "eval_type": "llm", + "num_demo": 1, 
+ "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.1142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.6357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.6949999999999998, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.11052631578947371, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.6250000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.6449999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.910526315789474, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.5650000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/Qwen2_VL_7B/summary_results.json b/static/eval_results/SI/Qwen2_VL_7B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c602e711d4391693f5f1065275958be22caa3a30 --- /dev/null +++ b/static/eval_results/SI/Qwen2_VL_7B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.3538656561495699, + "micro_mean_score": 0.34581250459157137, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.4517429592549692, + "micro_mean_score": 0.4730012300123002, + "missing_tasks": 
[] + }, + "overall_score": 0.3669159632302898 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.40533138482347386 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.3844930054666535 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.5151962864568788 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.28799562910106935 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.41129100495999377 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.31735419703044254 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.2780300019986884 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08484497782236566 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.4927960336040459 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6569285714285714 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.3555822260000372 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.27651089530142536 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4722059967533397 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.3279988413468837 + }, + "Artistic and 
Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.39575781162159634 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.167887099917599 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.38908261098680974 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3343930759222326 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.3068323820854221 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.31569247186288174 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.39180263622429157 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5064978792251521 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3669159632302898 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.44618631789079277 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.13224920829749706 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.36572347689075624 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.47845712831657317 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3507936507936508 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.28910547521894076 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.40527029084195965 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + 
"tasks": [], + "average_score": 0.25874500882297563 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/Qwen2_VL_7B/task_results.json b/static/eval_results/SI/Qwen2_VL_7B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2a50f6e2038e76e1c2ba23de9f9b41e73cd16852 --- /dev/null +++ b/static/eval_results/SI/Qwen2_VL_7B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + 
{ + "name": "ti_fused_vqa_chemistry", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.08607142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.5306122448979592, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.64, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.269528925627328, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"3d_indoor_scene_text_bbox_prediction", + "score": 0.12148848554948376, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.7309631423965609, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.8701298701298701, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.20139246340756983, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.4426666666666668, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.6222222222222221, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.9705882352941178, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.7178947368421053, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.738095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.6557012145398754, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.4413684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.7007142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.5857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.78125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.6517857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.892857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9662571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.3987142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.1507488668719037, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.5190473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.5089285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.08571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.09999999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"multiple_states_identify_africa", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.06598639455782314, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.43082396072826695, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0801904200671749, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.863095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.27993884030889277, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.45, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.10204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.25252525252525254, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.296078431372549, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 
0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.3027852300484236, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.06872294372294371, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"chinese_idiom_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.03355324641748354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.15789473684210525, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.21428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.08695652173913043, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"MMSoc_HatefulMemes", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6705882352941176, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.28852040816326535, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.30960884353741497, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.32142857142857145, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.8214285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": 
"poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.29411764705882354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 
0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.254, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.5857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.742857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.65, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.5172413793103449, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": 
"tweets_captioning", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.6241379310344827, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.7172413793103447, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.6137931034482759, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.7071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.44137931034482775, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.3551724137931035, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.48620689655172405, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.6225806451612903, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.5, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.10714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.32, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.7038461538461539, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.4555555555555556, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", + "score": 0.30714285714285705, + "eval_type": "llm", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.48965517241379297, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7631578947368421, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.3413793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.18571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.32142857142857145, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.1928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.39285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.05714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.4714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.2642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.4785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.75, + "eval_type": "llm", + "num_demo": 0, + 
"num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.6700000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.615, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.9, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.29, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.42857142857142855, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/SmolVLM/summary_results.json b/static/eval_results/SI/SmolVLM/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..97be21070ed94838e45c9cd7983b884ba1236b63 --- /dev/null +++ b/static/eval_results/SI/SmolVLM/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.07348385181460795, + "micro_mean_score": 0.0732694668402814, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.2427337975725658, + "micro_mean_score": 0.2504920049200492, + "missing_tasks": [] + }, + "overall_score": 0.09605051124900234 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.08610257462374318 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.10501451629704919 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + 
"average_score": 0.12403047579230878 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.0865768026006882 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.12889143083611815 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.077851045512787 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.061765081348496016 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.008178053830227744 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.1293055688222371 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.2222067669172932 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.029842216842698305 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.09044016512537822 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.1383108182448921 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.0979843882877799 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.143657576543239 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.10013149786398344 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.015386904208215372 + }, + "contextual_formatted_text": { + "count": 63, + 
"num_samples": 975, + "tasks": [], + "average_score": 0.12682789970863723 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.05128016118728194 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.09999979828107199 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.21315705831839693 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.10496742314924135 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.09605051124900241 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.03906165844850793 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.03272316763696074 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.05390625 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.1606753925138995 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.2222222222222222 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.13950042461525716 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.09639506190200878 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.06728619034079576 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/SmolVLM/task_results.json b/static/eval_results/SI/SmolVLM/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..df3d9af11d432c21a23c5e3b469fa763a71857e6 --- /dev/null +++ b/static/eval_results/SI/SmolVLM/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": 
"logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.09090909090909091, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "monthly_weather_days_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.47058823529411764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "medical_cell_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "image_translation_en2cn", + "score": 0.06919903502760565, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "CLEVRER_physics", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ishihara_test", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "healthcare_info_judgement", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "counting", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "location_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "transit_map_intersection_points", + "score": 0.017857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "chess_find_legal_moves", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "music_sheet_format_QA", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "number_comparison", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.04081632653061224, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + 
"name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "entertainment_web_game_style", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"electricity_load_estimate_plot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_programming_test_easy", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "stock_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "average_humidity_estimate_plot", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_connectivity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "license_plate_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "TV_show_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": 
"mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_isomorphism", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_area", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.057405852870824024, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.02, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "super_clevr", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + 
"name": "handwritten_math_expression_extraction", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.4638221428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "tqa_textbook_qa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "face_keypoint_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_winner_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15 + }, + { + "name": "algebra", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.060476150610554294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "game_info_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_number_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": 
"single_person_pose_estimation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "humor_understand_caption_match", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "places365_scene_type_classification", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": 
"constrained_generation_contain_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "panel_images_single_question", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"pictionary_doodle_guess", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "autorater_artifact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + 
"num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "shape_composition_shapes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + 
"name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_30", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table2latex_complex", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "meme_explain", + "score": 0.29285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.038000000000000006, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "funny_image_title", + "score": 0.5285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.5684210526315789, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "traffic_accident_analysis", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.1793103448275862, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_interpretation", + "score": 0.11379310344827588, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": 
"science_figure_explanation", + "score": 0.04137931034482759, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "ocrqa", + "score": 0.28965517241379307, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "electrocardiogram", + "score": 0.15714285714285717, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6461538461538462, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "docci_image_description_long", + "score": 0.5142857142857141, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.16, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.5290322580645161, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "iq_test", + "score": 0.25172413793103443, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.1448275862068966, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.4357142857142858, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "art_explanation", + "score": 0.09655172413793105, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bar_chart_interpretation", + "score": 0.19310344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.14827586206896554, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "defeasible_reasoning", + "score": 0.1482758620689655, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.3642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "visualization_with_code", + "score": 
0.08571428571428572, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.23571428571428577, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.19000000000000003, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_celebrity", + "score": 0.8250000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.14210526315789482, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_politics", + "score": 0.54, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_racial", + "score": 0.615, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.5842105263157894, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.05714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/all_model_keywords_stats.json b/static/eval_results/SI/all_model_keywords_stats.json deleted file mode 100644 index 91397f9b11ec753969773236bd3e64dd1aee80e2..0000000000000000000000000000000000000000 --- a/static/eval_results/SI/all_model_keywords_stats.json +++ /dev/null @@ -1,5348 +0,0 @@ -{ - "Aquila_VL_2B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.23446107609710548 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.08500232938689507 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.2736043135287443 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.19099680045595863 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.39206349206349206 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.3004030430829456 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.08421801129956362 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.2897054521388083 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.10279080594456047 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.3078950207372175 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.2248398559924241 - 
}, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.3533180891172854 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.11430966292465267 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.11601893140078427 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.2219754327969366 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.1772030496280578 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.1884228017877996 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.43875114784205704 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.23519563962981577 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.28092356180071465 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.13944236147744013 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.3826225373137124 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.20221672149607509 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.6020225563909773 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.2521179990443663 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.19504930283108274 - }, - "Object Recognition and 
Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.2374462987863378 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.0625675073438388 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.3521969849344277 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.18502360430789122 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.17480107496737848 - } - } - }, - "Aria": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.38003253384687213 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.33746818901184633 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4097428531166082 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.22745674367681176 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.4142857142857143 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.4433718463877228 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.10860172719687727 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.3496496998103286 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.04960831797041802 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.40912566596786665 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.3300885226603808 
- }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.45572004760273754 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.259572791833904 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.27807228404309764 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.3440023372395526 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.3053148323646246 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.2579833154471113 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.4787572696663607 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.3082165471908181 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.45805038774421686 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.3227895527307711 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.5240018518464876 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.3401734439719901 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.7129097744360902 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.40684369400912745 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.300830802045758 - }, - "Object Recognition and 
Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.33433893000455434 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.07560632809892315 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.49083567506460973 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.22595636868728874 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.3653361644690575 - } - } - }, - "Claude_3.5": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.6124985410830999 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.6692574633083122 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.5401030980230185 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.4760293511799448 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.4174603174603175 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.6061759059165749 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.315623741632974 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.5134329832846579 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.34512576094802216 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.6014068374421209 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 
0.5589506892621444 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.5314705050989759 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.4753194125515341 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.54981020669637 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5373019912310933 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.5072889926389097 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.5112348724553849 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.6164633346451529 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.4712835541311676 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.5769294912151234 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.5556080592390198 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.6017116084931068 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.530309401925396 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.7033233082706767 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.5757222503903228 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.5044379729567133 - }, - "Object Recognition 
and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.5499261524919171 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.19196633042767672 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.636886763741019 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.4511182385296208 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.6192941518442948 - } - } - }, - "Claude_3.5_new": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.565344887955182 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.6633000290867174 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.5737128945237007 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.4831956110227109 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.6285714285714286 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.6465631513465354 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.3511145464456188 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.5580232103280633 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.3619606028475468 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.5927094432064197 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 
0.5899091882733952 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.5838312144672865 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.4705509892153899 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.574168555556774 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5636254729390459 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.5249488326690246 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.5300876558354416 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.6380252743889108 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.5106873710119535 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.6409616762702612 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.5638133905687104 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.6433122980573076 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.5426169039575065 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.6839924812030076 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.6234123112506059 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.5171075478248572 - }, - "Object 
Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.583387314927874 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.22440221381985706 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.6507240054983652 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.48795977188332873 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.6242355223474262 - } - } - }, - "GPT_4o": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.5785991479925302 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.7387886231372116 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.6073751328612617 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.4387500704123191 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.626984126984127 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.6418126331560571 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.302146719713088 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.5184702350129554 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.3427989299648589 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.6086090683867454 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 
0.533172482404735 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.6107746700730057 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.4938444672052553 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.6093502418300007 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5672657028463585 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.5351259728352326 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.6016521462358102 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.6204512659058113 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.4632537848154335 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.6563556079088679 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.5370230887013343 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.6716375018861761 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.5506629280904943 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.7342894736842105 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.6512174145248227 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.47164342831848766 - }, - "Object 
Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.5798789532163023 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.1970421212123289 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.6933181759121947 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.4267383416112408 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.6400436962819274 - } - } - }, - "GPT_4o_mini": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.4556095354808589 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.5484747566251307 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.535145025177205 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.33759329198549914 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.4873015873015873 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.5437015929631214 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.22983305008250185 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.39601047285667923 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.1194248916897328 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.5198662454862603 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 
0.4194828137611333 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.5569877095654321 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.3779902828155749 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.4645916955325127 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.46343334374251277 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.38282644938937405 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.42048902061937554 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.5559184922821285 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.3777213713726476 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.5986898724975707 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.4761935495255144 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.5775026308600164 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.4555977624507237 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.7960714285714285 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.5458509360302554 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.37680368570252215 - }, - "Object 
Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.48241878593503174 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.17294565844175996 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.5987052352447554 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.33277278942510824 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.503118803606002 - } - } - }, - "Gemini_1.5_flash_002": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.47487599206349207 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.45245079667466714 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.5086518140501541 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.3853815223607656 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.4380952380952381 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.5468998820129136 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.21148887498941377 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.48499051643275837 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.3348446026637953 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.5535202379362348 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - 
"average_score": 0.46724590271207767 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.5613400178213946 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.33052002642818507 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.3722082840493195 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.45400479933257654 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.3691249729531883 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.42013434507914493 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.5905636451090996 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.43247267273235235 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.5470781816319514 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.43823554216399857 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.5955368143490581 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.4655431430975485 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.7948947368421052 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.5122400421391089 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.4086167264646781 
- }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.47630441828533016 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.09741974015331743 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.5920539115535787 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.3559690476405975 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.4474763430506795 - } - } - }, - "Gemini_1.5_pro_002": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.48587549603174607 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.504539358390968 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.5660366627264668 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.4200866579901879 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.48888888888888893 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.5964613809728712 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.28536490696494377 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.500158537824293 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.3592697030984118 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.6217290675275775 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": 
[], - "average_score": 0.5132563067393096 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.5888558035357285 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.4060403716629095 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.42724302639929596 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.5034399620483027 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.43754003302746525 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.4731762319443037 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.6245091608727974 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.48334866543174226 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.5644701189535662 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.4972242280053817 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.5995804836966744 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.5090111123207751 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.7830639097744362 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.5647567827649111 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 
0.448099634405986 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.5220033468415737 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.178032259819607 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.6342882147970302 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.3972807544005462 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.5000257619938475 - } - } - }, - "Idefics3": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.10420386904761905 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.03610947192711297 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.14759816564804443 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.07952603609985566 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.19999999999999998 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.17708549842279478 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.04525221984520586 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.1804888391778344 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.020659062938075456 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.21050154891192577 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 
1524, - "tasks": [], - "average_score": 0.14766910173600153 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.171712228743858 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.06561871098996794 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.03857183991826921 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.12057604157917215 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.15091196450213815 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.053829016986911726 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.13726004635095543 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.10744987600153451 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.2975217887286715 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.02100010342704044 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.2126465842330819 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.1166739111764397 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.2774436090225564 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.1724799353848912 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - 
"average_score": 0.1275512898313342 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.12579260798514427 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.014803312629399587 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.15897902615904647 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.09276606649010487 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.07893017100109866 - } - } - }, - "InternVL2_2B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.09082268323996265 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.03678177133215256 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.20753533217719797 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.12084183290294437 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.3428571428571428 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.19769593666817548 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.039950835968771276 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.15289272275533383 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.07184873949579831 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.18693717087010792 - }, - "Diagrams and Data Visualizations": { - 
"count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.15159509081542988 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.22923075081716637 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.09447908809124074 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.049217594376760605 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.14262795568189013 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.12369372450210245 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.11544832152620972 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.3044601862783681 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.12291071957107838 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.24746476545671045 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.042960275283590164 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.3035836752792625 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.08201891993308255 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.4728533834586467 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.1905261989833371 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 
1630, - "tasks": [], - "average_score": 0.1336101595652968 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.13333012698269087 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.013664596273291925 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.23055380602943532 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.11985812372011641 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.12376971454228163 - } - } - }, - "InternVL2_76B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.42624956232493 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.4585598678029664 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.47251288369387245 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.3075073077960568 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.5301587301587302 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.5361401478255164 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.1619462380866451 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.38874625564304305 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.30169355252977215 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.509332186292545 - }, - "Diagrams and Data 
Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.39253566766026804 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.5065289649268628 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.3333759749379774 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.39401514252711556 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.4205132675160581 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.3863929410693585 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.4041893680050902 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.5389260571078752 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.34950523809271744 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.48322911874283003 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.4030580663588658 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.5873606708191794 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.37110860027855824 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.7041804511278196 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.470239452171767 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - 
"num_samples": 1630, - "tasks": [], - "average_score": 0.3413715846680563 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.4230856844269695 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.10153556963007855 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.570666577587141 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.3276283897777921 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.4672429826553732 - } - } - }, - "InternVL2_8B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.34736300770308126 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.25646898023629483 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.34366199611891174 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.23531351908862871 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.3253968253968254 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.3784296942438538 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.09134825639389237 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.2912783917684807 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.0503849634147267 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.34383350461121587 - }, - "Diagrams and 
Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.27187498646061353 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.4088467630174509 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.21516421271234623 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.22539102164423624 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.29215647267040246 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.25281668404704594 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.2452385560845516 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.4334863789409244 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.26248166960198344 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.3417106670258814 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.27991889529924496 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.4403771552269444 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.27396131593770284 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.6521729323308272 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.3284779417766259 - }, - "Mathematical and Logical Reasoning": { - "count": 
91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.24983605813271914 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.2915702951202482 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.0592961015994038 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.41603267498315427 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.21701915158341967 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.30220279568886643 - } - } - }, - "Llama_3_2_11B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.14131944444444444 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.23423754995839735 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.33493936008655223 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.12719796356144183 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.22857142857142856 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.2740778723883188 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.09595984705908096 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.18716549835825297 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.14822411270107955 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.3275861238187186 - }, - 
"Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.1970899659349296 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.3387317156024255 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.13775107230512224 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.18967604731477847 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.23165426777444673 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.15123880546660726 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.16571305203663964 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.3762691853600945 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.16301171403498463 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.34463240030392384 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.24509462859331077 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.39649168256429074 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.21893599730050764 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.5728796992481204 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.25994005315432245 - }, - "Mathematical and Logical 
Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.14653430680774066 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.2546845731733449 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.058403715092363084 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.37246318118748967 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.15806381880426276 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.20716804318138016 - } - } - }, - "MiniCPM_v2.6": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.20497125933706817 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.21340553041678637 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.33417132133610217 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.14556723677922526 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.3507936507936508 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.3620762837308124 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.07517089101065139 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.25260048981169975 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.012567281814686655 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 
0.34994481629202306 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.23021362338817897 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.3681846956052881 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.17128318830807052 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.21066692306852683 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.25947537124244935 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.23679437883858215 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.21540007432647457 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.39586776859504136 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.2036075191422558 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.3711731498662282 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.20284349423013687 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.45156722842924535 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.2244713686485571 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.660718045112782 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.3045977370408878 - }, - 
"Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.18352505380246076 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.2657183000752527 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.06087615859328559 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.3977302205205499 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.17375496033997198 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.26814713591233313 - } - } - }, - "Molmo_72B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.2582151610644257 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.5042591723808818 - }, - "Knowledge": { - "count": 77, - "num_samples": 1279, - "tasks": [], - "average_score": 0.39648868632862583 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.2954490282663994 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.6714285714285714 - }, - "Perception": { - "count": 82, - "num_samples": 1306, - "tasks": [], - "average_score": 0.4704848349431393 - }, - "Planning": { - "count": 44, - "num_samples": 698, - "tasks": [], - "average_score": 0.13015529062282669 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.3557374102316002 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.18757766329699532 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - 
"average_score": 0.4405271103381682 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1509, - "tasks": [], - "average_score": 0.35176591065677537 - }, - "Photographs": { - "count": 83, - "num_samples": 1300, - "tasks": [], - "average_score": 0.47052754190598134 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.24743187516175363 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1107, - "tasks": [], - "average_score": 0.3754692399771127 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5182, - "tasks": [], - "average_score": 0.3757024328002091 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.3048441329189725 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.36443166533642163 - }, - "multiple_choice": { - "count": 33, - "num_samples": 552, - "tasks": [], - "average_score": 0.5421225239407056 - }, - "numerical_data": { - "count": 39, - "num_samples": 679, - "tasks": [], - "average_score": 0.3342330361070466 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.4120820025247545 - }, - "structured_output": { - "count": 72, - "num_samples": 1105, - "tasks": [], - "average_score": 0.3670439889863054 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 639, - "tasks": [], - "average_score": 0.445412976139552 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.3070615049173117 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 155, - "tasks": [], - "average_score": 0.5953120300751881 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1698, - "tasks": [], - "average_score": 0.4110431137367615 - }, 
- "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1615, - "tasks": [], - "average_score": 0.2983397150768741 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.4223762317042425 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 340, - "tasks": [], - "average_score": 0.07825953913967484 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.5756984198310193 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1257, - "tasks": [], - "average_score": 0.29197652844726363 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1672, - "tasks": [], - "average_score": 0.41462128751753047 - } - } - }, - "Molmo_7B_D": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.1158110119047619 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.27184002856754413 - }, - "Knowledge": { - "count": 77, - "num_samples": 1279, - "tasks": [], - "average_score": 0.2787344822161389 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.1740048655548875 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.3619047619047619 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.30311570603428784 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.06424366688759706 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.173722800705029 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.09043432702433757 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - 
"average_score": 0.3106093738160722 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.21356852314768052 - }, - "Photographs": { - "count": 83, - "num_samples": 1300, - "tasks": [], - "average_score": 0.323282724310645 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.1327652104313917 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.18796442406686825 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5213, - "tasks": [], - "average_score": 0.22943156697817663 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.17305260714177756 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.17907829453546903 - }, - "multiple_choice": { - "count": 33, - "num_samples": 552, - "tasks": [], - "average_score": 0.3169618260527351 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.22086240998395923 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.324079404512755 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.21610753722787088 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 639, - "tasks": [], - "average_score": 0.32356781790614975 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.19244928377978027 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 155, - "tasks": [], - "average_score": 0.4433947368421053 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1698, - "tasks": [], - "average_score": 0.25685172601108597 
- }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.17259103199957743 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.24958564675030656 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.035588894400059294 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.35830528296805764 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.1939605648275455 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.2239160707791646 - } - } - }, - "NVLM": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.37153871965452856 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.352859881186271 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.37572531212341936 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.2786818799518423 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.3047619047619048 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.45079588183469294 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.1252138046141793 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.3518857602487131 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.09447890526012262 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - 
"average_score": 0.4387718807206103 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.32094439294995036 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.4332099707344069 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.30070480033875985 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.2814148428882822 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.345503562629823 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.3215154320779893 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.29287492253780084 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.5016004197822379 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.28793758479482745 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.3828322321439372 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.34135355449546323 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.45915496990325566 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.3152573721587561 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.6521954887218044 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.38986101015677044 - 
}, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.30043411704099793 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.3359094293956291 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.07615011020932495 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.46386896656934745 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.26907670581189963 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.3943476764428869 - } - } - }, - "POINTS_15_7B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.31641062675070025 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.3095789895735217 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.35705988992418164 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.24128406446063128 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.48095238095238096 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.4420532221275683 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.1277481304284383 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.32551503611448934 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.15572486552610215 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": 
[], - "average_score": 0.37330010041194067 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.30991539183635347 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.4276343385855984 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.24722440389191766 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.27713077639707523 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.32686003793394974 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.3101162129247054 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.2614010338203017 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.4855568673750491 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.28761899055063767 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.37619796536407 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.3069044183161335 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.45980379926019677 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.30711751050032277 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.6173496240601504 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 
0.35317851821169477 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.28961632718794406 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.3333459246264911 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.08369131166291023 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.43105364189963935 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.26796963300870397 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.3443899066327916 - } - } - }, - "POINTS_7B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.3151282387955181 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.22503259387671015 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.27361452525243724 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.19633555542091463 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.34761904761904755 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.3737263982731003 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.08476480516686397 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.2606187882141402 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.1499797713556708 - }, - "Artistic and Creative Content": { - "count": 22, - 
"num_samples": 389, - "tasks": [], - "average_score": 0.33916980654110634 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.25684059763242745 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.3523684400745285 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.19332242733156837 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.18689735511962233 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.2615189201461682 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.23004840221723208 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.239982641771955 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.4200183654729108 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.23646374895042882 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.28263350209672056 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.2320749867881998 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.36827291874151846 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.21311917080615544 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.48204135338345855 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], 
- "average_score": 0.2799740367463896 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.22387504020162652 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.27890902837062037 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.06502747891786666 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.37373928195086786 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.21857370538972226 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.2684488868499041 - } - } - }, - "Phi-3.5-vision": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.3150531045751634 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.18412184931451608 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.3374902354661273 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.19473774010136682 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.4142857142857143 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.38360617573843164 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.09254779551496593 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.3034971430938622 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.04423070234557053 - }, - "Artistic and Creative Content": { - 
"count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.3249099963089235 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.2797292831010349 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.4073649042468842 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.1852656532829957 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.18482544209393917 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.2789407286767065 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.2141318618135909 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.23002523914604356 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.46076785167694245 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.20335546763980886 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.38510487366381607 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.2567782320477167 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.44526176399160444 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.26422404318271525 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.6958045112781954 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - 
"tasks": [], - "average_score": 0.3097558922032538 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.22905610983444738 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.2845968124529633 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.08173397535709728 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.4299430434813172 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.21524515429041854 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.2682909697086125 - } - } - }, - "Pixtral_12B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.3689221521942111 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.4143415072482432 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.37374171749764634 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.27839183583970506 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.3444444444444444 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.46377210154054166 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.122839038193565 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.35876745089800455 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.11048396896880823 - }, - "Artistic and Creative Content": { 
- "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.3947713702430871 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.36461586731895695 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.4327891810625066 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.2688429906381188 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.31669784888602887 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.3567653041737331 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.3161209026018942 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.29510067482559116 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.5076172985263894 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.3135393276021012 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.3995518703501119 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.36511340930610364 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.4193828210432134 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.35085932465399283 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.6302142857142857 - }, - "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - 
"tasks": [], - "average_score": 0.38842270268832113 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.3055711926752603 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.37359181974124417 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.08507904212012304 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.4677006268371793 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.28269833721806076 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.39551360119171197 - } - } - }, - "Qwen2_VL_2B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.18075323879551822 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.21948696002702636 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.28841305815072016 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.16147424237969243 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.36984126984126986 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.33781829803679747 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.08656714113327156 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.2448949597527861 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.09293971931071163 - }, - "Artistic and Creative 
Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.2842921728720087 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.23259922343062173 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.36205043973893236 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.13312812081322709 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.1930642044577058 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.24428672223428244 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.1652854805017628 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.17061075451792151 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.42328479601206864 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.23904036592289388 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.3296071840681468 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.23210528191388644 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.37769658880841467 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.21906286524745977 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.5887067669172933 - }, - "Language Understanding and Generation": { - "count": 102, - 
"num_samples": 1713, - "tasks": [], - "average_score": 0.27091980735233906 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.19211647307230917 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.25965511679594977 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.07432337143230854 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.3778480095314066 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.19305913502727232 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.2253353309586889 - } - } - }, - "Qwen2_VL_72B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.48352372198879545 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.6323628750211533 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4874613649312476 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.3355316008767396 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.39365079365079364 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.6141225191470527 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.23323065689783842 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.41914085094672937 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.18309869697155778 - }, - 
"Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.5251544991587351 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.4473618716373871 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.5308367876160253 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.4333175250859433 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.5070634902661117 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.4834475464413966 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.45605294241715827 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.4608929319719144 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.5851458306003763 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.44066773476234555 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.4974532098882374 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.49191356756271953 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.5782670824874114 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.43580017776139807 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.7294097744360902 - }, - "Language Understanding and Generation": { - 
"count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.5387802130987105 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.40095954140813556 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.49559260360544427 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.1474368760019346 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.6040487985710314 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.367367170491919 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.5688395686544739 - } - } - }, - "Qwen2_VL_7B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.36572347689075624 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.44618631789079277 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.40527029084195965 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.25874500882297563 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.3507936507936508 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.47845712831657317 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.13224920829749706 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.28910547521894076 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.167887099917599 - 
}, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.39575781162159634 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.3279988413468837 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.4722059967533397 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.27651089530142536 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.3555822260000372 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.3669159632302898 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.3343930759222326 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.3068323820854221 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.5064978792251521 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.31569247186288174 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.39180263622429157 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.38908261098680974 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.4927960336040459 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.31735419703044254 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.6569285714285714 - }, - "Language Understanding and Generation": { 
- "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.41129100495999377 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.28799562910106935 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.3844930054666535 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.08484497782236566 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.5151962864568788 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.2780300019986884 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.40533138482347386 - } - } - }, - "SmolVLM": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.05390625 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.03906165844850793 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.09639506190200878 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.06728619034079576 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.2222222222222222 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.1606753925138995 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.03272316763696074 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.13950042461525716 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 0.10013149786398344 - }, - 
"Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.143657576543239 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.0979843882877799 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.1383108182448921 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.09044016512537822 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.029842216842698305 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.09605051124900241 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.12682789970863723 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.05128016118728194 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.10496742314924135 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.09999979828107199 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.21315705831839693 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.015386904208215372 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.1293055688222371 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.077851045512787 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.2222067669172932 - }, - "Language Understanding and Generation": { - 
"count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.12889143083611815 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.0865768026006882 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.10501451629704919 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.008178053830227744 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.12403047579230878 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.061765081348496016 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.08610257462374318 - } - } - }, - "llava_onevision_72B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.3101241538281979 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.21993316800752236 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.4073185744352188 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.30843360355217414 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.4857142857142857 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.4151635490932759 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.14332941205758537 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.34229099411259356 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": [], - "average_score": 
0.15000864315905132 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.48700494939767686 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.3420108320438131 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.46321361231985364 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.1991087184305048 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.20630840715151963 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.32994677641726666 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.2595306800419483 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.3154587757748795 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.5216100397918579 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.29549573982348826 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.3969569321996683 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.28638031668330033 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.49641793863653866 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.34020787956522225 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.677251879699248 - }, - "Language 
Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.367151258145213 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.2882162928135965 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.35493339032346644 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.08886502118921868 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.49931032043437723 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.28423002295958694 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.2705047345723313 - } - } - }, - "llava_onevision_7B": { - "app": { - "Coding": { - "count": 16, - "num_samples": 244, - "tasks": [], - "average_score": 0.20031585550887018 - }, - "Information_Extraction": { - "count": 41, - "num_samples": 644, - "tasks": [], - "average_score": 0.1340041159644947 - }, - "Knowledge": { - "count": 77, - "num_samples": 1294, - "tasks": [], - "average_score": 0.32565632074201306 - }, - "Mathematics": { - "count": 30, - "num_samples": 497, - "tasks": [], - "average_score": 0.19520567001898761 - }, - "Metrics": { - "count": 3, - "num_samples": 45, - "tasks": [], - "average_score": 0.5126984126984127 - }, - "Perception": { - "count": 82, - "num_samples": 1321, - "tasks": [], - "average_score": 0.3545352938542377 - }, - "Planning": { - "count": 44, - "num_samples": 714, - "tasks": [], - "average_score": 0.10542024755948716 - }, - "Science": { - "count": 22, - "num_samples": 469, - "tasks": [], - "average_score": 0.27440171167785654 - } - }, - "input_format": { - "3D Models and Aerial Imagery": { - "count": 2, - "num_samples": 30, - "tasks": 
[], - "average_score": 0.1783310257200802 - }, - "Artistic and Creative Content": { - "count": 22, - "num_samples": 389, - "tasks": [], - "average_score": 0.39584024260311845 - }, - "Diagrams and Data Visualizations": { - "count": 88, - "num_samples": 1524, - "tasks": [], - "average_score": 0.252511232938778 - }, - "Photographs": { - "count": 83, - "num_samples": 1315, - "tasks": [], - "average_score": 0.41346984169922946 - }, - "Text-Based Images and Documents": { - "count": 53, - "num_samples": 847, - "tasks": [], - "average_score": 0.1159417852705533 - }, - "User Interface Screenshots": { - "count": 67, - "num_samples": 1123, - "tasks": [], - "average_score": 0.1368238769607056 - } - }, - "input_num": { - "1-image": { - "count": 315, - "num_samples": 5228, - "tasks": [], - "average_score": 0.25687697499702805 - } - }, - "output_format": { - "contextual_formatted_text": { - "count": 63, - "num_samples": 975, - "tasks": [], - "average_score": 0.19203135933620985 - }, - "exact_text": { - "count": 57, - "num_samples": 880, - "tasks": [], - "average_score": 0.2490174433570946 - }, - "multiple_choice": { - "count": 33, - "num_samples": 567, - "tasks": [], - "average_score": 0.43553281735099914 - }, - "numerical_data": { - "count": 39, - "num_samples": 694, - "tasks": [], - "average_score": 0.22047389017098817 - }, - "open_ended_output": { - "count": 51, - "num_samples": 991, - "tasks": [], - "average_score": 0.3490743804978922 - }, - "structured_output": { - "count": 72, - "num_samples": 1121, - "tasks": [], - "average_score": 0.19236693222061413 - } - }, - "skills": { - "Commonsense and Social Reasoning": { - "count": 38, - "num_samples": 654, - "tasks": [], - "average_score": 0.4322205869643684 - }, - "Domain-Specific Knowledge and Skills": { - "count": 46, - "num_samples": 897, - "tasks": [], - "average_score": 0.24367762339842414 - }, - "Ethical and Safety Reasoning": { - "count": 10, - "num_samples": 170, - "tasks": [], - "average_score": 0.5779849624060149 - }, 
- "Language Understanding and Generation": { - "count": 102, - "num_samples": 1713, - "tasks": [], - "average_score": 0.28693734738201987 - }, - "Mathematical and Logical Reasoning": { - "count": 91, - "num_samples": 1630, - "tasks": [], - "average_score": 0.19593817255686638 - }, - "Object Recognition and Classification": { - "count": 172, - "num_samples": 2714, - "tasks": [], - "average_score": 0.292593666904816 - }, - "Planning and Decision Making": { - "count": 23, - "num_samples": 356, - "tasks": [], - "average_score": 0.07666140459493773 - }, - "Scene and Event Understanding": { - "count": 60, - "num_samples": 1004, - "tasks": [], - "average_score": 0.44333006096492455 - }, - "Spatial and Temporal Reasoning": { - "count": 78, - "num_samples": 1273, - "tasks": [], - "average_score": 0.2134151671467958 - }, - "Text Recognition (OCR)": { - "count": 101, - "num_samples": 1687, - "tasks": [], - "average_score": 0.19363816536239586 - } - } - } -} \ No newline at end of file diff --git a/static/eval_results/SI/all_summary.json b/static/eval_results/SI/all_summary.json deleted file mode 100644 index f1fa085799256ca76153f1797c742c435bffb125..0000000000000000000000000000000000000000 --- a/static/eval_results/SI/all_summary.json +++ /dev/null @@ -1,509 +0,0 @@ -{ - "Aquila_VL_2B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.20770364903712493, - "micro_mean_score": 0.20333142638522636, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.31474202723571276, - "micro_mean_score": 0.3326568265682657, - "missing_tasks": [] - }, - "overall_score": 0.22197543279693666 - }, - "Aria": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3178882776147889, - "micro_mean_score": 0.3101511832828904, - "missing_tasks": [] - }, - "open": { - 
"num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5137437248005172, - "micro_mean_score": 0.5472939729397295, - "missing_tasks": [] - }, - "overall_score": 0.34400233723955265 - }, - "Claude_3.5": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.520276385877485, - "micro_mean_score": 0.5148202137998056 - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.6479684260295507, - "micro_mean_score": 0.6801968019680197 - }, - "overall_score": 0.5373019912310938 - }, - "Claude_3.5_new": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5462752278980763, - "micro_mean_score": 0.5417881438289601 - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.6764020657053476, - "micro_mean_score": 0.6924969249692496 - }, - "overall_score": 0.5636254729390457 - }, - "GPT_4o": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5529953662872719, - "micro_mean_score": 0.5483479105928085 - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.6600228904804206, - "micro_mean_score": 0.6801968019680197 - }, - "overall_score": 0.5672657028463584 - }, - "GPT_4o_mini": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4431039098921726, - "micro_mean_score": 0.43780369290573373 - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.595574663769726, - "micro_mean_score": 0.6334563345633456 - }, - "overall_score": 0.46343334374251305 - }, - "Gemini_1.5_flash_002": { - "core": { - "num_eval_tasks": 273, - 
"num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.43481964330318734, - "micro_mean_score": 0.4297862001943635 - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5787083135236054, - "micro_mean_score": 0.6186961869618696 - }, - "overall_score": 0.4540047993325765 - }, - "Gemini_1.5_pro_002": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4914311038229404, - "micro_mean_score": 0.48323615160349853 - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5814975405131552, - "micro_mean_score": 0.6174661746617466 - }, - "overall_score": 0.5034399620483024 - }, - "Idefics3": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.08941182847569326, - "micro_mean_score": 0.08779475233900695, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3231434267517844, - "micro_mean_score": 0.3618081180811809, - "missing_tasks": [] - }, - "overall_score": 0.12057604157917208 - }, - "InternVL2_2B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.12069001041308772, - "micro_mean_score": 0.11842605219090299, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.28522459992910454, - "micro_mean_score": 0.28886838868388687, - "missing_tasks": [] - }, - "overall_score": 0.14262795568189 - }, - "InternVL2_76B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3998616568018755, - "micro_mean_score": 0.39149064302628933, - "missing_tasks": [] - }, - "open": { - 
"num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.554748737158244, - "micro_mean_score": 0.5800738007380073, - "missing_tasks": [] - }, - "overall_score": 0.42051326751605805 - }, - "InternVL2_8B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.27650612401825575, - "micro_mean_score": 0.27119471729837735, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.39388373890935635, - "micro_mean_score": 0.4045510455104551, - "missing_tasks": [] - }, - "overall_score": 0.29215647267040246 - }, - "Llama_3_2_11B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.20789144960796493, - "micro_mean_score": 0.20163641703273802, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3861125858565788, - "micro_mean_score": 0.4130381303813038, - "missing_tasks": [] - }, - "overall_score": 0.2316542677744468 - }, - "MiniCPM_v2.6": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.23230765810722817, - "micro_mean_score": 0.22684118052665975, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4360655066213874, - "micro_mean_score": 0.4588560885608856, - "missing_tasks": [] - }, - "overall_score": 0.2594753712424494 - }, - "Molmo_72B": { - "core": { - "num_eval_tasks": 270, - "num_eval_samples": 4073, - "num_not_eval_samples": 0, - "macro_mean_score": 0.36480000609384927, - "micro_mean_score": 0.36205779758110807, - "missing_tasks": [ - "table_understanding", - "MMSoc_Misinformation_PolitiFact", - "planning_screenshot_termes" - ] - }, - "open": { - 
"num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4465682063915481, - "micro_mean_score": 0.4850553505535054, - "missing_tasks": [] - }, - "overall_score": 0.3758072638262318 - }, - "Molmo_7B_D": { - "core": { - "num_eval_tasks": 272, - "num_eval_samples": 4102, - "num_not_eval_samples": 0, - "macro_mean_score": 0.2098088446992518, - "micro_mean_score": 0.20550929661464645, - "missing_tasks": [ - "MMSoc_Misinformation_PolitiFact" - ] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.35697926179118733, - "micro_mean_score": 0.38936039360393604, - "missing_tasks": [] - }, - "overall_score": 0.22949405972428777 - }, - "NVLM": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.32989872890926025, - "micro_mean_score": 0.32315683713111915, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4469349818134809, - "micro_mean_score": 0.4881303813038132, - "missing_tasks": [] - }, - "overall_score": 0.34550356262982296 - }, - "POINTS_15_7B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.31355970638319003, - "micro_mean_score": 0.30728203432446294, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.41331219301389166, - "micro_mean_score": 0.42749077490774917, - "missing_tasks": [] - }, - "overall_score": 0.32686003793395024 - }, - "POINTS_7B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.25511317681632334, - "micro_mean_score": 0.24927711632415062, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 
0, - "macro_mean_score": 0.30315625179016, - "micro_mean_score": 0.3313653136531366, - "missing_tasks": [] - }, - "overall_score": 0.26151892014616823 - }, - "Phi-3.5-vision": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.2561274958722834, - "micro_mean_score": 0.2504214576875906, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4272267419054576, - "micro_mean_score": 0.445879458794588, - "missing_tasks": [] - }, - "overall_score": 0.2789407286767066 - }, - "Pixtral_12B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3436942439614412, - "micro_mean_score": 0.3373564384613738, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4417271955536318, - "micro_mean_score": 0.4845633456334564, - "missing_tasks": [] - }, - "overall_score": 0.3567653041737333 - }, - "Qwen2_VL_2B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.22787906973244856, - "micro_mean_score": 0.2234748515064842, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3509364634962041, - "micro_mean_score": 0.3768757687576875, - "missing_tasks": [] - }, - "overall_score": 0.24428672223428263 - }, - "Qwen2_VL_72B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4730536307784527, - "micro_mean_score": 0.4659830915476831, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.5510079982505317, - "micro_mean_score": 0.5826568265682657, - "missing_tasks": [] - }, - 
"overall_score": 0.48344754644139654 - }, - "Qwen2_VL_7B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3538656561495699, - "micro_mean_score": 0.34581250459157137, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4517429592549692, - "micro_mean_score": 0.4730012300123002, - "missing_tasks": [] - }, - "overall_score": 0.3669159632302898 - }, - "SmolVLM": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.07348385181460795, - "micro_mean_score": 0.0732694668402814, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.2427337975725658, - "micro_mean_score": 0.2504920049200492, - "missing_tasks": [] - }, - "overall_score": 0.09605051124900234 - }, - "llava_onevision_72B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.312618242621264, - "micro_mean_score": 0.3098623876487132, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.4425822460912829, - "micro_mean_score": 0.47539975399754, - "missing_tasks": [] - }, - "overall_score": 0.32994677641726655 - }, - "llava_onevision_7B": { - "core": { - "num_eval_tasks": 273, - "num_eval_samples": 4116, - "num_not_eval_samples": 0, - "macro_mean_score": 0.23683339637631812, - "micro_mean_score": 0.23283041278687175, - "missing_tasks": [] - }, - "open": { - "num_eval_tasks": 42, - "num_eval_samples": 813, - "num_not_eval_samples": 0, - "macro_mean_score": 0.3871602360316429, - "micro_mean_score": 0.4113161131611316, - "missing_tasks": [] - }, - "overall_score": 0.25687697499702805 - } -} \ No newline at end of file diff --git 
a/static/eval_results/SI/llava_onevision_72B/summary_results.json b/static/eval_results/SI/llava_onevision_72B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c67b592ab42608c0b29d97657f792f219073bd8b --- /dev/null +++ b/static/eval_results/SI/llava_onevision_72B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + "num_not_eval_samples": 0, + "macro_mean_score": 0.312618242621264, + "micro_mean_score": 0.3098623876487132, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.4425822460912829, + "micro_mean_score": 0.47539975399754, + "missing_tasks": [] + }, + "overall_score": 0.32994677641726655 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2705047345723313 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.35493339032346644 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.49931032043437723 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.2882162928135965 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.367151258145213 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.34020787956522225 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.28423002295958694 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08886502118921868 + }, + "Commonsense and Social Reasoning": { + "count": 38, + 
"num_samples": 654, + "tasks": [], + "average_score": 0.49641793863653866 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.677251879699248 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.20630840715151963 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.1991087184305048 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.46321361231985364 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.3420108320438131 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.48700494939767686 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.15000864315905132 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.28638031668330033 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.2595306800419483 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.3154587757748795 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.29549573982348826 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3969569321996683 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5216100397918579 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.32994677641726666 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + 
"average_score": 0.21993316800752236 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.14332941205758537 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.3101241538281979 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.4151635490932759 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4857142857142857 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.34229099411259356 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4073185744352188 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.30843360355217414 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/llava_onevision_72B/task_results.json b/static/eval_results/SI/llava_onevision_72B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2e64e10176bf69dd6c2f3cd47aa2b99e41d457ad --- /dev/null +++ b/static/eval_results/SI/llava_onevision_72B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { 
+ "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.21666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.76, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + 
"name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.20833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.4892773511051451, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.08573157203238838, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.5480340058696004, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.35064935064935066, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 
0.2657477500954247, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.6279999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.6010216430095937, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.12184873949579833, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.36842105263157904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.5495762050605968, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.43773684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.4885714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.0642857142857143, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.40873015873015867, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.7666714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.4807857142857144, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.2502288590795625, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.6293368421052632, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.16964285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.2222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.33035714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.5777777777777777, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"symbolic_graphics_programs_computer_aided_design", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.1111111111111111, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.12857142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + 
"score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.009120898057654902, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.8761904761904761, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.20408163265306123, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.18336944138598113, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.3, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.10204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.7894736842105263, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 
0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.4523809523809524, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.20202020202020202, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": 
"mensa_iq_test", + "score": 0.2843137254901961, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.6034155146542303, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"signage_navigation", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.3027210884353741, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.1875, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.021835265116335066, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.42857142857142855, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.18421052631578946, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.5882352941176471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.21739130434782608, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + 
"eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.6904761904761906, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6000000000000001, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.6666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.41913265306122444, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.31428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 
0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.4117647058823529, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.23999999999999996, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.27142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.7071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.8357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.6500000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.6965517241379311, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.7241379310344828, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.4928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.6413793103448276, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.710344827586207, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.6642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.5413793103448277, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.18620689655172415, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.7241379310344829, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.3548387096774194, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + 
"name": "graph_interpretation", + "score": 0.4482758620689657, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.5714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.28666666666666657, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6230769230769231, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.38888888888888884, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "visualization_with_code", + "score": 0.35, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.6689655172413793, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.8052631578947368, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.5517241379310345, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.3928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.15, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.10714285714285714, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.07857142857142856, + "eval_type": "llm", + 
"num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.049999999999999996, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.06428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.07857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.4785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_racial", + "score": 0.565, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.11052631578947371, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.535, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.6349999999999999, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.665, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/static/eval_results/SI/llava_onevision_7B/summary_results.json b/static/eval_results/SI/llava_onevision_7B/summary_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0a8707a14e183e8e3fb3dbb2232b4d1fca07b301 --- /dev/null +++ b/static/eval_results/SI/llava_onevision_7B/summary_results.json @@ -0,0 +1,219 @@ +{ + "model_summary": { + "core": { + "num_eval_tasks": 273, + "num_eval_samples": 4116, + 
"num_not_eval_samples": 0, + "macro_mean_score": 0.23683339637631812, + "micro_mean_score": 0.23283041278687175, + "missing_tasks": [] + }, + "open": { + "num_eval_tasks": 42, + "num_eval_samples": 813, + "num_not_eval_samples": 0, + "macro_mean_score": 0.3871602360316429, + "micro_mean_score": 0.4113161131611316, + "missing_tasks": [] + }, + "overall_score": 0.25687697499702805 + }, + "keyword_stats": { + "skills": { + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.19363816536239586 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.292593666904816 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.44333006096492455 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.19593817255686638 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.28693734738201987 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.24367762339842414 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.2134151671467958 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.07666140459493773 + }, + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.4322205869643684 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.5779849624060149 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.1368238769607056 + }, + "Text-Based Images and Documents": { + "count": 53, + 
"num_samples": 847, + "tasks": [], + "average_score": 0.1159417852705533 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.41346984169922946 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.252511232938778 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.39584024260311845 + }, + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1783310257200802 + } + }, + "output_format": { + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.19236693222061413 + }, + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.19203135933620985 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2490174433570946 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.22047389017098817 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3490743804978922 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.43553281735099914 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.25687697499702805 + } + }, + "app": { + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.1340041159644947 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.10542024755948716 + }, + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.20031585550887018 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3545352938542377 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + 
"tasks": [], + "average_score": 0.5126984126984127 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.27440171167785654 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.32565632074201306 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19520567001898761 + } + } + } +} \ No newline at end of file diff --git a/static/eval_results/SI/llava_onevision_7B/task_results.json b/static/eval_results/SI/llava_onevision_7B/task_results.json new file mode 100644 index 0000000000000000000000000000000000000000..dbab1adf3b34fe5c4cd1fe0d5f48efeb76f3eacc --- /dev/null +++ b/static/eval_results/SI/llava_onevision_7B/task_results.json @@ -0,0 +1,2207 @@ +[ + { + "name": "science_molecule_chemistry", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "signboard_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "funsd_document_qa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "physical_property_reasoning", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "map_diagram_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_connectivity", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_isomorphism", + "score": 
0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "question_solution_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 0.9333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.11224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_transformation", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_breakpoint", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.54, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "license_plate_recognition", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.041666666666666664, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 15 + }, + { + "name": "chart_vqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.24958220225240815, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.07094776572587472, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.42105263157894735, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "quizlet_question_solving", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.5626373174966284, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "tqa_textbook_qa", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_info_parsing", + "score": 0.2727272727272727, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.21324372091628846, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "table_understanding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"average_humidity_estimate_plot", + "score": 0.09466666666666665, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_basic_physics", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "long_string_number_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_theory", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.24444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.046218487394957986, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.3899999999999999, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "TV_show_info_parsing", + "score": 0.30158730158730157, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.4755460318827545, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.4832631578947368, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.12864285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "famous_building_recognition", + "score": 0.40625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": 
"multilingual_game_info_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_info_parsing", + "score": 0.17063492063492064, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.36628571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.37007142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.11252203631219156, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ti_fused_vqa_math", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.4404578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "iconqa", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 19 + }, + { + "name": "movie_info_parsing", + "score": 0.16071428571428573, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.05952380952380952, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "music_info_parsing", + "score": 0.13392857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mnist_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.35555555555555557, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "geometry_descriptive", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.16666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.028571428571428574, + "eval_type": "rule", + "num_demo": 1, + "num_query": 
14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "flowchart_code_generation", + "score": 0.5555555555555556, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "healthcare_info_judgement", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "Ad_count_detection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09455782312925169, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.39285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "extract_webpage_headline", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_note_count", + "score": 0.058823529411764705, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 17 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hashtag_recommendation", + "score": 0.8642857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.061224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.09927221295148407, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "game_platform_support_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_sheet_format_QA", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, 
+ "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "CLEVRER_physics", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "location_vqa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.10204081632653061, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_storage", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "llavaguard", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "soccer_offside", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.13131313131313133, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mensa_iq_test", + "score": 0.17107843137254902, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + 
}, + { + "name": "arxiv_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.5569127613427832, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "transit_map_intersection_points", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"geometry_reasoning_overlapped_circle", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ascii_art_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.03317029264010414, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "product_ocr_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Bongard_Problem", + "score": 0.15789473684210525, + "eval_type": "rule", + 
"num_demo": 1, + "num_query": 19 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.23529411764705882, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ishihara_test", + "score": 0.4571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.2, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "constrained_generation_contain_length", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 
0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6470588235294118, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "reward_models_I2T_reward", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_MATH", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.619047619047619, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 14 + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.17729591836734696, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.3034297052154195, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 0.06666666666666667, + 
"eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.15, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.11764705882352941, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + 
"name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.15999999999999998, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "electrocardiogram", + "score": 0.27142857142857146, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funny_image_title", + "score": 0.5642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.6142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "meme_explain", + "score": 0.6071428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_humor_understanding", + "score": 0.5896551724137932, + 
"eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "tweets_captioning", + "score": 0.5499999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "defeasible_reasoning", + "score": 0.5344827586206896, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "vibe-eval", + "score": 0.2928571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocrqa", + "score": 0.5172413793103449, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "figurative_speech_explanation", + "score": 0.6275862068965519, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "docci_image_description_long", + "score": 0.6714285714285715, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bar_chart_interpretation", + "score": 0.35172413793103446, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "iq_test", + "score": 0.30344827586206885, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "unusual_images", + "score": 0.5172413793103448, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "GUI_Chat_Hard", + "score": 0.5161290322580645, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31 + }, + { + "name": "graph_interpretation", + "score": 0.21724137931034485, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "traffic_accident_analysis", + "score": 0.7071428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_explanation", + "score": 0.5399999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5384615384615384, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "table2latex_complex", + "score": 0.2888888888888889, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9 + }, + { + 
"name": "visualization_with_code", + "score": 0.29285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_figure_explanation", + "score": 0.4620689655172414, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "guess_image_generation_prompt", + "score": 0.7421052631578948, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "art_explanation", + "score": 0.33103448275862063, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.4428571428571428, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.03571428571428571, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.021428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.028571428571428574, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.1928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + 
{ + "name": "red_teaming_racial", + "score": 0.7300000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_celebrity", + "score": 0.6150000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.655, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8684210526315791, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.555, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "ascii_art_30", + "score": 0.14285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + } +] \ No newline at end of file diff --git a/utils.py b/utils.py index c1e6d15789aaf7b0b3bec09ced62662de9ef6759..a33c572a63f4bd4b13e5ceeb7b3636fde093d31d 100644 --- a/utils.py +++ b/utils.py @@ -1,6 +1,7 @@ import pandas as pd import json from typing import Dict, Any, Tuple +import os # Keep all the constant mappings outside the class MODEL_NAME_MAP = { @@ -116,6 +117,8 @@ MODEL_URLS = { "POINTS_15_7B": "https://huggingface.co./WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat", "SmolVLM": "https://huggingface.co./HuggingFaceTB/SmolVLM-Instruct", "Mammoth_VL": "https://huggingface.co./MAmmoTH-VL/MAmmoTH-VL-8B", + "InternVL2_5_78B": "https://huggingface.co./OpenGVLab/InternVL2_5-78B", + "InternVL2_5_2B": "https://huggingface.co./OpenGVLab/InternVL2_5-2B", } class BaseDataLoader: @@ -190,52 +193,37 @@ class BaseDataLoader: def _load_summary_data(self) -> Dict[str, Any]: raise NotImplementedError("Subclasses must implement _load_summary_data") - def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame: - raise NotImplementedError("Subclasses must implement get_df") - - def 
get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]: - raise NotImplementedError("Subclasses must implement get_leaderboard_data") - - -class DefaultDataLoader(BaseDataLoader): - def __init__(self): - super().__init__() - - def _load_model_data(self) -> Dict[str, Any]: - with open("./static/eval_results/Default/all_model_keywords_stats.json", "r") as f: - return json.load(f) - - def _load_summary_data(self) -> Dict[str, Any]: - with open("./static/eval_results/Default/all_summary.json", "r") as f: - return json.load(f) - def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame: original_dimension = get_original_dimension(selected_super_group) data = [] + for model in self.MODEL_GROUPS[selected_model_group]: + if model not in self.MODEL_DATA or model not in self.SUMMARY_DATA: + continue + model_data = self.MODEL_DATA[model] summary = self.SUMMARY_DATA[model] - if summary["core_noncot"]: - core_noncot_score = summary["core_noncot"]["macro_mean_score"] - else: - core_noncot_score = '-' - if summary["core_cot"]: - core_cot_score = summary["core_cot"]["macro_mean_score"] - else: - core_cot_score = '-' + + # Basic model information row = { "Models": get_display_model_name(model, as_link=True), "Overall": round(summary["overall_score"] * 100, 2), - "Core w/o CoT": round(core_noncot_score * 100, 2) if core_noncot_score != '-' else '-', - "Core w/ CoT": round(core_cot_score * 100, 2) if core_cot_score != '-' else '-', - "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2) if summary["open"] else '-' + "Core": round(summary["core"]["macro_mean_score"] * 100, 2), + "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2) } - for display_name in self.SUPER_GROUPS[selected_super_group]: - original_keyword = self.keyword_display_map[display_name] - if original_dimension in model_data and original_keyword in model_data[original_dimension]: - row[display_name] = 
round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2) - else: + + # Add dimension-specific scores + if original_dimension in model_data: + for display_name in self.SUPER_GROUPS[selected_super_group]: + original_keyword = self.keyword_display_map[display_name] + if original_keyword in model_data[original_dimension]: + row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2) + else: + row[display_name] = None + else: + for display_name in self.SUPER_GROUPS[selected_super_group]: row[display_name] = None + data.append(row) df = pd.DataFrame(data) @@ -246,8 +234,8 @@ class DefaultDataLoader(BaseDataLoader): df = self.get_df(selected_super_group, selected_model_group) # Get total task counts from the first model's data - sample_model = next(iter(self.MODEL_DATA)) - total_core_tasks = self.SUMMARY_DATA[sample_model]["core_noncot"]["num_eval_tasks"] + sample_model = next(iter(self.SUMMARY_DATA)) + total_core_tasks = self.SUMMARY_DATA[sample_model]["core"]["num_eval_tasks"] total_open_tasks = self.SUMMARY_DATA[sample_model]["open"]["num_eval_tasks"] total_tasks = total_core_tasks + total_open_tasks @@ -255,8 +243,7 @@ class DefaultDataLoader(BaseDataLoader): column_headers = { "Models": "Models", "Overall": f"Overall({total_tasks})", - "Core w/o CoT": f"Core w/o CoT({total_core_tasks})", - "Core w/ CoT": f"Core w/ CoT({total_core_tasks})", + "Core": f"Core({total_core_tasks})", "Open-ended": f"Open-ended({total_open_tasks})" } @@ -266,93 +253,96 @@ class DefaultDataLoader(BaseDataLoader): headers = [ column_headers["Models"], column_headers["Overall"], - column_headers["Core w/o CoT"], - column_headers["Core w/ CoT"], + column_headers["Core"], column_headers["Open-ended"] ] + self.SUPER_GROUPS[selected_super_group] data = df[[ column_headers["Models"], column_headers["Overall"], - column_headers["Core w/o CoT"], - column_headers["Core w/ CoT"], + column_headers["Core"], column_headers["Open-ended"] ] 
+ self.SUPER_GROUPS[selected_super_group]].values.tolist() return headers, data -class SingleImageDataLoader(BaseDataLoader): +class DefaultDataLoader(BaseDataLoader): def __init__(self): super().__init__() def _load_model_data(self) -> Dict[str, Any]: - with open("./static/eval_results/SI/all_model_keywords_stats.json", "r") as f: - return json.load(f) - - def _load_summary_data(self) -> Dict[str, Any]: - with open("./static/eval_results/SI/all_summary.json", "r") as f: - return json.load(f) - - def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame: - original_dimension = get_original_dimension(selected_super_group) - data = [] - for model in self.MODEL_GROUPS[selected_model_group]: - model_data = self.MODEL_DATA[model] - summary = self.SUMMARY_DATA[model] - row = { - "Models": get_display_model_name(model, as_link=True), - "Overall": round(summary["overall_score"] * 100, 2), - "Core": round(summary["core"]["macro_mean_score"] * 100, 2), - "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2) - } - for display_name in self.SUPER_GROUPS[selected_super_group]: - original_keyword = self.keyword_display_map[display_name] - if original_dimension in model_data and original_keyword in model_data[original_dimension]: - row[display_name] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2) - else: - row[display_name] = None - data.append(row) + model_data = {} + base_path = "./static/eval_results/Default" - df = pd.DataFrame(data) - df = df.sort_values(by="Overall", ascending=False) - return df + try: + model_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))] + for model_name in model_folders: + model_path = f"{base_path}/{model_name}/summary_results.json" + with open(model_path, "r") as f: + data = json.load(f) + if "keyword_stats" in data: + model_data[model_name] = data["keyword_stats"] + except FileNotFoundError: + pass + + return model_data - def 
get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]: - df = self.get_df(selected_super_group, selected_model_group) + def _load_summary_data(self) -> Dict[str, Any]: + summary_data = {} + base_path = "./static/eval_results/Default" - # Get total task counts from the first model's data - sample_model = next(iter(self.MODEL_DATA)) - total_core_tasks = self.SUMMARY_DATA[sample_model]["core"]["num_eval_tasks"] - total_open_tasks = self.SUMMARY_DATA[sample_model]["open"]["num_eval_tasks"] - total_tasks = total_core_tasks + total_open_tasks + try: + model_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))] + for model_name in model_folders: + model_path = f"{base_path}/{model_name}/summary_results.json" + with open(model_path, "r") as f: + data = json.load(f) + if "model_summary" in data: + summary_data[model_name] = data["model_summary"] + except FileNotFoundError: + pass - # Define headers with task counts - column_headers = { - "Models": "Models", - "Overall": f"Overall({total_tasks})", - "Core": f"Core({total_core_tasks})", - "Open-ended": f"Open-ended({total_open_tasks})" - } + return summary_data + + +class SingleImageDataLoader(BaseDataLoader): + def __init__(self): + super().__init__() + + def _load_model_data(self) -> Dict[str, Any]: + model_data = {} + base_path = "./static/eval_results/SI" - # Rename the columns in DataFrame to match headers - df = df.rename(columns=column_headers) + try: + model_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))] + for model_name in model_folders: + model_path = f"{base_path}/{model_name}/summary_results.json" + with open(model_path, "r") as f: + data = json.load(f) + if "keyword_stats" in data: + model_data[model_name] = data["keyword_stats"] + except FileNotFoundError: + pass - headers = [ - column_headers["Models"], - column_headers["Overall"], - column_headers["Core"], - 
column_headers["Open-ended"] - ] + self.SUPER_GROUPS[selected_super_group] + return model_data + + def _load_summary_data(self) -> Dict[str, Any]: + summary_data = {} + base_path = "./static/eval_results/SI" - data = df[[ - column_headers["Models"], - column_headers["Overall"], - column_headers["Core"], - column_headers["Open-ended"] - ] + self.SUPER_GROUPS[selected_super_group]].values.tolist() + try: + model_folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))] + for model_name in model_folders: + model_path = f"{base_path}/{model_name}/summary_results.json" + with open(model_path, "r") as f: + data = json.load(f) + if "model_summary" in data: + summary_data[model_name] = data["model_summary"] + except FileNotFoundError: + pass - return headers, data + return summary_data # Keep your helper functions @@ -367,3 +357,4 @@ def get_display_model_name(model_name: str, as_link: bool = True) -> str: if as_link and model_name in MODEL_URLS: return f'{display_name}' return display_name +